Load libraries

INDEX******************************************************************************************************************

Figure 1. a. (left) 407, stats 335 (right) 245 b. 2785, stats 2862 c. 2953 d. 2625

Figure 2. a. 730 stats, 611 b. 1262, stats 1144 c. 2785, stats 2862 d. 2625 e. 3279 f. right 3328; left 3305

Figure S3. 460

Figure S4. a. 945 b. 1422 c. 1023

Figure S5. a. 1222 b. 1317 c. 911

Figure S6. a. 1463 b. 1526 c. 1624

Figure 3. b. 1979 c. 2231 f. 2784, stats 2862 g. 2625

Figure 4. a. 2902, stats 2883 b. left 3048; deviance 3070; right 2953
c. 2625, stats 2743

Figure 5. a. 4365 b. 2841, stats 2862 c. 2962, 3464 d. 3159 e. left 2700, right 3493 f. 3440, 3411

Figure 6. a. 3790 b. 4501 c. 4270, 4231, 3950 d. 3790, 3950, 4028 e. 4115, 4155, 4221; stats 4073, 4222

PRS****************************************************************************************************************

Input the PRS data

# Read the PRS data (tab-separated, one row per individual; trait score
# columns plus arrest_ontology and Set membership are used below).
# NOTE(review): absolute, user-specific path — consider a relative path or a
# configurable data directory so the analysis is portable. TODO confirm.
prs_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/PRS_array_for_manuscript_fixed.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)

Set color palette

group_colors  <- c("grey","#3C8C78") 

Define your list of traits to analyze

# List of traits to analyze: the 18 PRS trait columns in prs_data that are
# z-normalized and tested below (cardiac electrophysiology, cardiac structure,
# and epilepsy phenotypes).
traits <- c("Heart_rate", "LQTS", "PR_interval", "QTc", "Afib", "HRV", "BP", "QRS", "HCM", "LVEDV","LVEDVI","LVEF","LVESV","LVESVI","SV","SVI","GGE","Focal_Epilepsy")
traits
##  [1] "Heart_rate"     "LQTS"           "PR_interval"    "QTc"           
##  [5] "Afib"           "HRV"            "BP"             "QRS"           
##  [9] "HCM"            "LVEDV"          "LVEDVI"         "LVEF"          
## [13] "LVESV"          "LVESVI"         "SV"             "SVI"           
## [17] "GGE"            "Focal_Epilepsy"

Z normalize the scores

# Z-score every trait column; the scaled copies are stored alongside the
# originals as "Normalized_<trait>". scale() returns a one-column matrix,
# so as.numeric() flattens it back to a plain vector.
prs_data <- prs_data %>%
  mutate(across(all_of(traits), ~ as.numeric(scale(.x)), .names = "Normalized_{.col}"))

define the cases and controls

# Map arrest_ontology codes onto the analysis groups:
#   code 1        -> Control
#   codes 2 to 6  -> Case
group_definitions <- list(
  Control = c(1),
  Case    = c(2, 3, 4, 5, 6)
)

Format the data

# Categorize arrest_ontology into specified groups
# (code 1 -> "Control", codes 2-6 -> "Case"; any other code is kept as its
# character value and then removed by the filter below).
prs_data <- prs_data %>%
  mutate(arrest_group = case_when(
    arrest_ontology %in% group_definitions[["Control"]] ~ "Control",
    arrest_ontology %in% group_definitions[["Case"]] ~ "Case",
    TRUE ~ as.character(arrest_ontology)
  ))

# Filter out only the relevant groups for the plot
relevant_groups <- names(group_definitions)
filtered_data <- prs_data %>%
  filter(arrest_group %in% relevant_groups)

# Generate normalized column names based on the traits list
normalized_traits <- paste("Normalized", traits, sep = "_")

# Filter out non-existing columns from the normalized_traits
# (defensive: keeps the melt below from erroring if a trait column is absent)
existing_normalized_traits <- normalized_traits[normalized_traits %in% names(filtered_data)]

# Reshape the filtered data to long format for the normalized traits
# (one row per individual x trait, keyed by arrest_group; used for plotting)
melted_data_normalized <- reshape2::melt(filtered_data, 
                               id.vars = c('arrest_group'), 
                               measure.vars = existing_normalized_traits)

Split into discovery or replication

# Discovery cohort — used for all statistics in this section.
prs_data_discovery <- filtered_data %>%
  filter(Set == "Discovery")

# Replication cohort — held out here; presumably analyzed in a later
# section of the document (not used in this chunk).
prs_data_replication <- filtered_data %>%
  filter(Set == "Replication")

Make the Correlation plot

# Select only the relevant columns for correlation
# NOTE(review): this uses `normalized_traits` rather than the defensive
# `existing_normalized_traits` computed earlier, so it errors if any trait
# column is missing from the discovery data — TODO confirm all 18 exist.
data_for_correlation <- prs_data_discovery[, normalized_traits]

# Order the traits for display in the correlation plot
# (manual display ordering — presumably chosen for visual grouping; TODO confirm)
corr_traits <- c(
  "Normalized_QRS", 
  "Normalized_BP",
  "Normalized_SVI",
  "Normalized_LVESV", 
  "Normalized_PR_interval", 
  "Normalized_HRV", 
  "Normalized_GGE", 
  "Normalized_LVEDV",
  "Normalized_LVESVI", 
  "Normalized_Focal_Epilepsy", 
  "Normalized_HCM", 
  "Normalized_LQTS", 
  "Normalized_Afib", 
  "Normalized_LVEDVI", 
  "Normalized_SV", 
  "Normalized_Heart_rate", 
  "Normalized_QTc", 
  "Normalized_LVEF"
)


# Ensure the DataFrame is ordered according to trait preferences
data_for_correlation_ordered <- data_for_correlation[, corr_traits]

# Calculate correlation matrix; "complete.obs" = listwise deletion (only rows
# complete across all traits contribute)
cor_matrix_ordered <- cor(data_for_correlation_ordered, use = "complete.obs")

#Plot correlation
# Full heatmap with a diverging blue-white-gold palette; the diagonal
# (self-correlation = 1) is suppressed.
corrplot(cor_matrix_ordered, 
         method = "color",  
         type = "full",
         tl.col = "black",
         tl.srt = 45,
         cl.ratio = 0.2,
         col = colorRampPalette(c("#05618F", "white", "#F0BE3C"))(200),
         diag = FALSE,
         addgrid.col = "black"  
)

#Show the results
# Pairwise Pearson p-values via rcorr (presumably Hmisc::rcorr, which uses
# pairwise-complete observations — note this differs from the listwise
# cor(..., use = "complete.obs") above; TODO confirm intended).
correlation_results <- rcorr(as.matrix(data_for_correlation_ordered), type = "pearson")
correlation_results$P
##                           Normalized_QRS Normalized_BP Normalized_SVI
## Normalized_QRS                        NA  9.933678e-01   6.853895e-01
## Normalized_BP               9.933678e-01            NA   1.671754e-06
## Normalized_SVI              6.853895e-01  1.671754e-06             NA
## Normalized_LVESV            4.734779e-05  1.440789e-01   5.176039e-08
## Normalized_PR_interval      6.161903e-03  1.510868e-01   3.070402e-01
## Normalized_HRV              5.899446e-06  3.828959e-01   7.641050e-07
## Normalized_GGE              9.659843e-02  1.270667e-01   7.128341e-03
## Normalized_LVEDV            4.954003e-02  1.918542e-01   4.352831e-03
## Normalized_LVESVI           6.277110e-04  5.726864e-01   5.992465e-01
## Normalized_Focal_Epilepsy   9.955037e-02  3.021332e-01   2.018146e-01
## Normalized_HCM              2.396194e-02  7.604039e-01   2.522892e-02
## Normalized_LQTS             5.021915e-02  2.378957e-01   3.264184e-02
## Normalized_Afib             2.837904e-01  7.458152e-01   7.488965e-04
## Normalized_LVEDVI           4.291581e-02  2.605060e-03   1.172185e-05
## Normalized_SV               7.958341e-01  5.007362e-03   1.326023e-03
## Normalized_Heart_rate       7.704693e-04  7.404989e-02   0.000000e+00
## Normalized_QTc              3.611700e-01  3.700776e-05   1.148859e-12
## Normalized_LVEF             1.031027e-01  1.295488e-01   7.635401e-06
##                           Normalized_LVESV Normalized_PR_interval
## Normalized_QRS                4.734779e-05            0.006161903
## Normalized_BP                 1.440789e-01            0.151086783
## Normalized_SVI                5.176039e-08            0.307040155
## Normalized_LVESV                        NA            0.279786899
## Normalized_PR_interval        2.797869e-01                     NA
## Normalized_HRV                1.727377e-03            0.183954907
## Normalized_GGE                2.297310e-01            0.926572064
## Normalized_LVEDV              0.000000e+00            0.326133747
## Normalized_LVESVI             0.000000e+00            0.977482753
## Normalized_Focal_Epilepsy     3.413192e-01            0.873919485
## Normalized_HCM                0.000000e+00            0.779477674
## Normalized_LQTS               4.128755e-02            0.130976701
## Normalized_Afib               5.958125e-04            0.092859152
## Normalized_LVEDVI             0.000000e+00            0.837999134
## Normalized_SV                 6.087284e-09            0.126695164
## Normalized_Heart_rate         9.201374e-03            0.151032458
## Normalized_QTc                6.331019e-01            0.389631858
## Normalized_LVEF               0.000000e+00            0.257495908
##                           Normalized_HRV Normalized_GGE Normalized_LVEDV
## Normalized_QRS              5.899446e-06   9.659843e-02     4.954003e-02
## Normalized_BP               3.828959e-01   1.270667e-01     1.918542e-01
## Normalized_SVI              7.641050e-07   7.128341e-03     4.352831e-03
## Normalized_LVESV            1.727377e-03   2.297310e-01     0.000000e+00
## Normalized_PR_interval      1.839549e-01   9.265721e-01     3.261337e-01
## Normalized_HRV                        NA   1.417508e-05     7.232691e-01
## Normalized_GGE              1.417508e-05             NA     4.327716e-01
## Normalized_LVEDV            7.232691e-01   4.327716e-01               NA
## Normalized_LVESVI           4.718379e-01   3.926784e-01     0.000000e+00
## Normalized_Focal_Epilepsy   2.515402e-01   9.696364e-01     1.792096e-01
## Normalized_HCM              2.268539e-01   7.662938e-02     9.463941e-11
## Normalized_LQTS             9.602938e-01   6.911754e-01     5.693882e-05
## Normalized_Afib             6.384594e-01   5.193028e-02     4.874951e-01
## Normalized_LVEDVI           3.163305e-01   7.224922e-02     0.000000e+00
## Normalized_SV               5.701852e-01   3.181255e-01     0.000000e+00
## Normalized_Heart_rate       1.551204e-12   1.646706e-01     1.215043e-03
## Normalized_QTc              6.050812e-03   9.795078e-03     9.106381e-03
## Normalized_LVEF             1.320877e-02   3.297949e-01     1.024596e-07
##                           Normalized_LVESVI Normalized_Focal_Epilepsy
## Normalized_QRS                  0.000627711                0.09955037
## Normalized_BP                   0.572686369                0.30213319
## Normalized_SVI                  0.599246487                0.20181462
## Normalized_LVESV                0.000000000                0.34131916
## Normalized_PR_interval          0.977482753                0.87391948
## Normalized_HRV                  0.471837901                0.25154018
## Normalized_GGE                  0.392678433                0.96963636
## Normalized_LVEDV                0.000000000                0.17920961
## Normalized_LVESVI                        NA                0.30285180
## Normalized_Focal_Epilepsy       0.302851796                        NA
## Normalized_HCM                  0.000000000                0.95432712
## Normalized_LQTS                 0.022630206                0.59571578
## Normalized_Afib                 0.617610997                0.90444909
## Normalized_LVEDVI               0.000000000                0.70452838
## Normalized_SV                   0.016279998                0.71995021
## Normalized_Heart_rate           0.285679168                0.19226455
## Normalized_QTc                  0.071460435                0.97448035
## Normalized_LVEF                 0.000000000                0.87505014
##                           Normalized_HCM Normalized_LQTS Normalized_Afib
## Normalized_QRS              2.396194e-02    5.021915e-02    0.2837904158
## Normalized_BP               7.604039e-01    2.378957e-01    0.7458151593
## Normalized_SVI              2.522892e-02    3.264184e-02    0.0007488965
## Normalized_LVESV            0.000000e+00    4.128755e-02    0.0005958125
## Normalized_PR_interval      7.794777e-01    1.309767e-01    0.0928591521
## Normalized_HRV              2.268539e-01    9.602938e-01    0.6384594227
## Normalized_GGE              7.662938e-02    6.911754e-01    0.0519302805
## Normalized_LVEDV            9.463941e-11    5.693882e-05    0.4874950835
## Normalized_LVESVI           0.000000e+00    2.263021e-02    0.6176109965
## Normalized_Focal_Epilepsy   9.543271e-01    5.957158e-01    0.9044490887
## Normalized_HCM                        NA    2.747912e-01    0.2051656313
## Normalized_LQTS             2.747912e-01              NA    0.6090492099
## Normalized_Afib             2.051656e-01    6.090492e-01              NA
## Normalized_LVEDVI           3.149592e-06    5.104404e-06    0.8024317643
## Normalized_SV               5.644828e-01    2.725126e-06    0.7848328230
## Normalized_Heart_rate       7.267055e-01    2.228441e-01    0.0174432137
## Normalized_QTc              9.979204e-01    0.000000e+00    0.1904777957
## Normalized_LVEF             0.000000e+00    5.014860e-01    0.0219999301
##                           Normalized_LVEDVI Normalized_SV Normalized_Heart_rate
## Normalized_QRS                 4.291581e-02  7.958341e-01          7.704693e-04
## Normalized_BP                  2.605060e-03  5.007362e-03          7.404989e-02
## Normalized_SVI                 1.172185e-05  1.326023e-03          0.000000e+00
## Normalized_LVESV               0.000000e+00  6.087284e-09          9.201374e-03
## Normalized_PR_interval         8.379991e-01  1.266952e-01          1.510325e-01
## Normalized_HRV                 3.163305e-01  5.701852e-01          1.551204e-12
## Normalized_GGE                 7.224922e-02  3.181255e-01          1.646706e-01
## Normalized_LVEDV               0.000000e+00  0.000000e+00          1.215043e-03
## Normalized_LVESVI              0.000000e+00  1.628000e-02          2.856792e-01
## Normalized_Focal_Epilepsy      7.045284e-01  7.199502e-01          1.922646e-01
## Normalized_HCM                 3.149592e-06  5.644828e-01          7.267055e-01
## Normalized_LQTS                5.104404e-06  2.725126e-06          2.228441e-01
## Normalized_Afib                8.024318e-01  7.848328e-01          1.744321e-02
## Normalized_LVEDVI                        NA  0.000000e+00          2.187902e-01
## Normalized_SV                  0.000000e+00            NA          1.209219e-02
## Normalized_Heart_rate          2.187902e-01  1.209219e-02                    NA
## Normalized_QTc                 1.142112e-03  4.917063e-06          2.184145e-02
## Normalized_LVEF                3.606922e-03  1.979529e-02          1.680620e-02
##                           Normalized_QTc Normalized_LVEF
## Normalized_QRS              3.611700e-01    1.031027e-01
## Normalized_BP               3.700776e-05    1.295488e-01
## Normalized_SVI              1.148859e-12    7.635401e-06
## Normalized_LVESV            6.331019e-01    0.000000e+00
## Normalized_PR_interval      3.896319e-01    2.574959e-01
## Normalized_HRV              6.050812e-03    1.320877e-02
## Normalized_GGE              9.795078e-03    3.297949e-01
## Normalized_LVEDV            9.106381e-03    1.024596e-07
## Normalized_LVESVI           7.146043e-02    0.000000e+00
## Normalized_Focal_Epilepsy   9.744803e-01    8.750501e-01
## Normalized_HCM              9.979204e-01    0.000000e+00
## Normalized_LQTS             0.000000e+00    5.014860e-01
## Normalized_Afib             1.904778e-01    2.199993e-02
## Normalized_LVEDVI           1.142112e-03    3.606922e-03
## Normalized_SV               4.917063e-06    1.979529e-02
## Normalized_Heart_rate       2.184145e-02    1.680620e-02
## Normalized_QTc                        NA    8.490931e-02
## Normalized_LVEF             8.490931e-02              NA

Test for ANOVA assumptions

# Check the standard one-way ANOVA assumptions for a single trait:
#   * normality of residuals       (Shapiro-Wilk)
#   * homogeneity of variance      (Levene's test, leveneTest from car)
#   * homogeneity of variance      (Bartlett's test)
# Returns a one-row data frame of p-values, labelled by trait.
check_anova_assumptions <- function(data, trait) {
  # ANOVA needs the grouping variable as a factor
  data$arrest_group <- as.factor(data$arrest_group)

  # Fit trait ~ arrest_group and pull out the residuals
  # (local names avoid shadowing base::formula / stats::residuals)
  model_formula <- as.formula(paste(trait, "~ arrest_group"))
  fitted_model <- aov(model_formula, data = data)
  model_residuals <- residuals(fitted_model)

  # Run each assumption test and keep only its p-value
  shapiro_p <- shapiro.test(model_residuals)$p.value
  levene_p <- leveneTest(model_formula, data = data)$`Pr(>F)`[1]
  bartlett_p <- bartlett.test(model_formula, data = data)$p.value

  data.frame(
    Trait = trait,
    Shapiro_Wilk_p_value = shapiro_p,
    Levene_p_value = levene_p,
    Bartlett_p_value = bartlett_p
  )
}

# Run the assumption checks for every normalized trait in the discovery set.
# (lapply passes each trait name positionally into `trait`, with the data
# frame fixed by name.)
anova_assumptions_results <- lapply(
  normalized_traits,
  check_anova_assumptions,
  data = prs_data_discovery
)

# Stack the one-row results into a single table and display it
anova_assumptions_df <- do.call(rbind, anova_assumptions_results)
print(anova_assumptions_df)
##                        Trait Shapiro_Wilk_p_value Levene_p_value
## 1      Normalized_Heart_rate         3.670786e-05   3.402430e-01
## 2            Normalized_LQTS         8.582805e-01   8.524520e-01
## 3     Normalized_PR_interval         9.626637e-02   7.067574e-01
## 4             Normalized_QTc         1.378815e-07   4.084220e-01
## 5            Normalized_Afib         5.252293e-01   4.766202e-01
## 6             Normalized_HRV         2.678043e-13   1.221016e-01
## 7              Normalized_BP         2.145452e-02   8.637621e-01
## 8             Normalized_QRS         1.508077e-39   2.042997e-06
## 9             Normalized_HCM         3.144379e-03   9.212937e-01
## 10          Normalized_LVEDV         1.331310e-01   2.218475e-04
## 11         Normalized_LVEDVI         5.919657e-02   7.738241e-03
## 12           Normalized_LVEF         2.471126e-02   3.111787e-01
## 13          Normalized_LVESV         1.169793e-02   9.853473e-01
## 14         Normalized_LVESVI         7.239432e-04   5.086718e-01
## 15             Normalized_SV         1.118507e-05   5.214914e-01
## 16            Normalized_SVI         8.610508e-04   8.519458e-01
## 17            Normalized_GGE         4.448074e-02   6.815913e-01
## 18 Normalized_Focal_Epilepsy         3.025018e-03   6.100802e-01
##    Bartlett_p_value
## 1      6.287623e-02
## 2      7.465532e-01
## 3      8.498751e-01
## 4      5.892923e-01
## 5      5.905392e-01
## 6      6.549201e-02
## 7      8.042278e-01
## 8      1.040405e-24
## 9      8.033151e-01
## 10     9.995313e-04
## 11     1.890503e-02
## 12     4.997414e-01
## 13     8.378271e-01
## 14     2.967646e-01
## 15     6.599025e-01
## 16     6.489996e-01
## 17     5.545373e-01
## 18     4.610240e-01

Since some traits do not meet the ANOVA assumptions, we use nonparametric tests instead

# Define Wilcoxon Rank Sum test function with median difference calculation.
# Compares the distribution of `trait` between arrest groups with a Wilcoxon
# rank-sum test and reports a Bonferroni-adjusted p-value.
#
# Args:
#   data    : data frame containing `arrest_group` and the trait column.
#   trait   : name of the (normalized) trait column to test.
#   n_tests : family size for the Bonferroni correction. Defaults to 18
#             (the number of traits in this analysis); previously this was
#             hard-coded inside the function body.
#
# Returns a one-row data frame: raw and adjusted p-values, a significance
# flag (adjusted p < 0.05), and the difference in group medians.
perform_wilcoxon_test <- function(data, trait, n_tests = 18) {
  # Keep only the grouping column and the trait, dropping missing values
  data <- data %>%
    dplyr::select(arrest_group, all_of(trait)) %>%
    drop_na()

  # Calculate medians for each group
  group_medians <- data %>%
    group_by(arrest_group) %>%
    summarise(median_value = median(!!sym(trait)))

  # Difference in medians. group_by() orders levels alphabetically
  # ("Case" before "Control"), so this is median(Control) - median(Case)
  # — TODO confirm the intended sign convention.
  diff_median <- diff(group_medians$median_value)

  # Perform Wilcoxon rank-sum test (wilcox_test takes data first, formula
  # second — rstatix-style interface)
  wilcoxon_test_result <- wilcox_test(data, as.formula(paste(trait, "~ arrest_group")))

  # Extract p-value
  p_value <- wilcoxon_test_result$p

  # Bonferroni correction across the family of trait tests
  p_adjusted <- p.adjust(p_value, method = "bonferroni", n = n_tests)

  # Assemble the one-row result
  result_df <- data.frame(
    Trait = trait,
    p_value = p_value,
    p_adjusted = p_adjusted,
    significant = p_adjusted < 0.05,
    diff_median = diff_median
  )

  return(result_df)
}

# Perform Wilcoxon test for each trait and store results (one one-row data
# frame per trait, in the order of normalized_traits)
wilcoxon_results <- lapply(normalized_traits, function(trait) perform_wilcoxon_test(prs_data_discovery, trait))

# Combine all Wilcoxon results into a single data frame
combined_wilcoxon_results <- do.call(rbind, wilcoxon_results)

# Print combined Wilcoxon results with p-values and median differences
print("Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:")
## [1] "Wilcoxon Rank Sum Test Results with Manual P-value Adjustments and Median Differences:"
print(combined_wilcoxon_results)
##                        Trait  p_value p_adjusted significant  diff_median
## 1      Normalized_Heart_rate 1.24e-02 2.2320e-01       FALSE  0.088165794
## 2            Normalized_LQTS 1.91e-01 1.0000e+00       FALSE  0.112902629
## 3     Normalized_PR_interval 2.34e-02 4.2120e-01       FALSE -0.160790733
## 4             Normalized_QTc 4.67e-03 8.4060e-02       FALSE  0.179095083
## 5            Normalized_Afib 1.07e-01 1.0000e+00       FALSE  0.080017842
## 6             Normalized_HRV 1.51e-01 1.0000e+00       FALSE -0.089874654
## 7              Normalized_BP 1.21e-06 2.1780e-05        TRUE -0.275654610
## 8             Normalized_QRS 1.12e-08 2.0160e-07        TRUE -0.128954882
## 9             Normalized_HCM 4.76e-01 1.0000e+00       FALSE  0.031802432
## 10          Normalized_LVEDV 6.38e-01 1.0000e+00       FALSE -0.044351499
## 11         Normalized_LVEDVI 7.37e-02 1.0000e+00       FALSE  0.118958269
## 12           Normalized_LVEF 1.76e-03 3.1680e-02        TRUE  0.124406676
## 13          Normalized_LVESV 7.02e-03 1.2636e-01       FALSE -0.124673369
## 14         Normalized_LVESVI 9.49e-01 1.0000e+00       FALSE -0.011000752
## 15             Normalized_SV 3.55e-02 6.3900e-01       FALSE  0.146894687
## 16            Normalized_SVI 1.77e-04 3.1860e-03        TRUE -0.360642163
## 17            Normalized_GGE 3.20e-01 1.0000e+00       FALSE -0.061833861
## 18 Normalized_Focal_Epilepsy 9.01e-01 1.0000e+00       FALSE  0.001265513

Make the lollipop plot

# Define the groups for metrics and syndromes
# (quantitative ECG / cardiac-imaging measurements vs disease-syndrome PRSs)
metrics <- c("Normalized_Heart_rate", "Normalized_PR_interval", "Normalized_QTc", "Normalized_HRV", "Normalized_BP", "Normalized_QRS",
             "Normalized_LVEDV", "Normalized_LVEDVI", "Normalized_LVEF", "Normalized_LVESV", "Normalized_LVESVI", "Normalized_SV", "Normalized_SVI")
syndromes <- c("Normalized_LQTS", "Normalized_Afib", "Normalized_HCM", "Normalized_GGE", "Normalized_Focal_Epilepsy")

# Categorize each trait as 'Metrics' or 'Syndromes'
# (anything in neither list falls through to "Combined")
combined_wilcoxon_results$Category <- ifelse(combined_wilcoxon_results$Trait %in% metrics, "Metrics",
                                             ifelse(combined_wilcoxon_results$Trait %in% syndromes, "Syndromes", "Combined"))

# Reorder the levels of Trait based on the Category
# NOTE(review): these levels are overwritten further down, where Trait is
# re-leveled to `ordered_traits` before plotting, so this step has no
# lasting effect on the figure.
combined_wilcoxon_results <- combined_wilcoxon_results %>%
    mutate(Trait = factor(Trait, levels = unique(Trait[order(Category)])))

# Transform P-values to -log10 scale (y-axis of the lollipop plot)
combined_wilcoxon_results$NegLogP <- -log10(combined_wilcoxon_results$p_value)

# Reference lines: nominal alpha = 0.05 and Bonferroni-corrected 0.05/18
threshold <- -log10(0.05)
threshold2 <- -log10(0.05/18)

# Create a new variable for coloring based on P-value significance
# (note: uses the *unadjusted* p_value, not p_adjusted)
combined_wilcoxon_results$Significant <- ifelse(combined_wilcoxon_results$p_value < 0.05, "Significant", "Not Significant")

# Combine metrics and syndromes into a single ordered list
# (the fixed x-axis ordering for the lollipop plot; same order as the
# correlation-plot trait ordering above)
ordered_traits <- c(
  "Normalized_QRS", 
  "Normalized_BP",
  "Normalized_SVI",
  "Normalized_LVESV", 
  "Normalized_PR_interval", 
  "Normalized_HRV", 
  "Normalized_GGE", 
  "Normalized_LVEDV",
  "Normalized_LVESVI", 
  "Normalized_Focal_Epilepsy", 
  "Normalized_HCM", 
  "Normalized_LQTS", 
  "Normalized_Afib", 
  "Normalized_LVEDVI", 
  "Normalized_SV", 
  "Normalized_Heart_rate", 
  "Normalized_QTc", 
  "Normalized_LVEF"
)

# Update the 'Trait' column to have this specific order
combined_wilcoxon_results$Trait <- factor(combined_wilcoxon_results$Trait, levels = ordered_traits)

# Create the lollipop plot of -log10(p) per trait, coloring by diff_median.
# `linewidth` replaces the `size` aesthetic for line geoms (deprecated in
# ggplot2 3.4.0) — this removes the deprecation warning previously emitted.
p3 <- ggplot(combined_wilcoxon_results, aes(x = Trait, y = NegLogP, color = diff_median)) +
    geom_segment(aes(xend = Trait, yend = 0), linewidth = 1) +
    geom_point(size = 3) +
    # Nominal (p = 0.05, black) and Bonferroni (p = 0.05/18, red) thresholds
    geom_hline(yintercept = threshold, linetype = "dotted", color = "Black") +
    geom_hline(yintercept = threshold2, linetype = "dotted", color = "Red") +
    # Diverging color scale centered at zero median difference
    scale_color_gradient2(low = "#3C8C78", mid = "white", high = "black", midpoint = 0) +
    theme_cowplot() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          strip.text.x = element_text(size = 10)) +
    labs(y = "-log10(P-value)", x = "Trait", color = "Median Difference", title = "Lollipop Plot of -log10(P-values) by Trait")
# Display the plot
print(p3)

# Function to generate plots for a given trait.
# Builds a two-panel figure: (A) relative density of the trait's overall
# ranks by arrest group, and (B) the corresponding empirical CDF. Ranking
# puts both groups on a common, distribution-free scale.
#
# Args:
#   data         : data frame with the trait column and `arrest_group`.
#   trait        : name of the trait column to rank and plot.
#   group_colors : fill/line colors, one per arrest group.
#
# Returns the combined cowplot grid (density panel over CDF panel).
generate_plots_for_trait <- function(data, trait, group_colors) {
  # Rank the scores for the specified trait (NA scores keep NA ranks)
  data$rank <- rank(data[[trait]], na.last = "keep")

  # Density plot of ranks by group
  density_plot <- ggplot(data, aes(x = rank, fill = arrest_group, y = after_stat(density))) +
    geom_density(alpha = 0.6) +
    scale_fill_manual(values = group_colors) +
    labs(title = paste("Relative Density of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Relative Density") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Empirical CDF of ranks by group.
  # `linewidth` replaces the deprecated `size` aesthetic for line geoms
  # (ggplot2 >= 3.4.0).
  cdf_plot <- ggplot(data, aes(x = rank, color = arrest_group)) +
    stat_ecdf(geom = "step", linewidth = 1) +
    scale_color_manual(values = group_colors) +
    labs(title = paste("CDF of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Cumulative Proportion") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Stack the two panels vertically with A/B labels
  combined_plot <- plot_grid(density_plot, cdf_plot, ncol = 1, align = 'v', labels = c("A", "B"))

  # Return the combined plot
  return(combined_plot)
}


# List of traits to plot
# NOTE(review): this overwrites the 18-element `traits` vector defined at the
# top of the analysis with a 5-trait subset; any later code re-reading
# `traits` will see only these five — consider a distinct name. TODO confirm.
traits <- c("BP", "Afib", "QRS","LVEF","Heart_rate")

# Initialize an empty list to store plots
plots_list <- list()

# Loop through each trait and generate plots, storing them in the list
# (plots are built from the raw trait columns; ranking inside the helper
# puts groups on a common scale)
for(trait in traits) {
  plots_list[[trait]] <- generate_plots_for_trait(prs_data_discovery, trait, group_colors)
}

######## Access each plot by its trait name from the plots_list ##########
# For example, to print the plot for QRS, use:
print(plots_list[["QRS"]])

print(plots_list[["BP"]])

Coding regions******************************************************************************************************************

Bring in data

# Per-individual coding-variant burden statistics (tab-separated).
# NOTE(review): absolute, user-specific path — consider making it relative.
coding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/statistics_by_individual_replication_for_manuscript_fixed_NEW.txt", header = TRUE, sep = "\t")

# Discovery subset: rows flagged Replication == 0
coding_stats_discovery <- coding_stats[coding_stats$Replication == 0, ]

Categorize the data

# Map Category codes to Case/Control (1 -> Control, 2-6 -> Case).
# Codes outside 1-6 get NA here and are removed by the na.omit() step below.
categorized_coding_stats_discovery <- coding_stats_discovery %>%
  mutate(Category_Group = case_when(
    Category == 1 ~ "Control",
    Category %in% 2:6 ~ "Case"
  ))

Merge some of the column data to get the desired allele frequency intervals

# Merge allele-frequency bins into the intervals used in the figures:
#   *_01_00001      = 0.01 bin + 0.001 bin
#   *_00001         = 0.00001 bin + singletons
#   *_total_missense = sum over all five missense AF bins
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
  mutate(
    Epilepsy_01_00001 = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_00001 = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    Epilepsy_total_missense = Epilepsy_missense_common + Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001 +
                              Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    
    Cardiomyopathy_01_00001 = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_00001 = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_total_missense = Cardiomyopathy_missense_common + Cardiomyopathy_missense_variant_0.01 +
                                    Cardiomyopathy_missense_variant_0.001 + Cardiomyopathy_missense_variant_0.00001 +
                                    Cardiomyopathy_missense_singleton
  )

# Clean up the data: remove 'NA's, 'Inf's, and select only needed columns
# (na.omit drops rows with any NA, including unmapped Category_Group)
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
  dplyr::select(-ID, -Category) %>%
  na.omit() %>%
  dplyr::filter(if_all(everything(), ~ !is.infinite(.)))

# Aggregate the data to compute mean and standard deviation
# (one row per Category_Group; columns <var>_mean and <var>_std)
collapsed_coding_stats <- categorized_coding_stats_discovery %>%
  group_by(Category_Group) %>%
  summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE), 
                                           std = ~sd(., na.rm = TRUE))))

# Clean up the collapsed data
collapsed_coding_stats <- na.omit(collapsed_coding_stats)

Perform permutation simulation and test

# List of coding variables to analyze with the permutation test below:
# HIGH-impact, start-lost, stop-gained, the merged missense AF bins, total
# missense burden, and P/LP variant counts — for both gene sets.
coding_variables_to_analyze <- c("Cardiomyopathy_HIGH", "Cardiomyopathy_start_lost", 
                                 "Cardiomyopathy_stop_gained", "Cardiomyopathy_01_00001", 
                                 "Cardiomyopathy_00001", "Cardiomyopathy_total_missense", "PLP_Cardiomyopathy",
                                 "Epilepsy_HIGH", "Epilepsy_start_lost", 
                                 "Epilepsy_stop_gained", "Epilepsy_01_00001", 
                                 "Epilepsy_00001", "Epilepsy_total_missense", "PLP_Epilepsy")

# Define the permutation function.
# Two-sided permutation test for a difference in group means between "Case"
# and "Control": group labels are shuffled `num_permutations` times and the
# absolute mean difference is recomputed under each shuffle.
#
# The p-value uses the standard +1 correction,
#   p = (1 + #{permuted >= observed}) / (1 + num_permutations),
# so it can never be exactly 0 (a true permutation p-value of 0 is not
# attainable; see Phipson & Smyth 2010). Previously the raw proportion was
# used, which reported p = 0.0000 for extreme observed statistics.
#
# Returns list(observed_stat, perm_stats, p_value), or NULL (with a warning)
# when either group has fewer than 2 members.
permutation_test <- function(data, var, group_var, num_permutations = 10000) {
  
  # Require at least 2 observations in each group
  if (sum(data[[group_var]] == "Case", na.rm = TRUE) > 1 && 
      sum(data[[group_var]] == "Control", na.rm = TRUE) > 1) {
    
    # Observed absolute difference in group means
    observed_stat <- abs(mean(data[[var]][data[[group_var]] == "Case"], na.rm = TRUE) - 
                         mean(data[[var]][data[[group_var]] == "Control"], na.rm = TRUE))
    
    # Null distribution: recompute the statistic under shuffled labels
    perm_stats <- numeric(num_permutations)
    for (i in seq_len(num_permutations)) {
      permuted_group <- sample(data[[group_var]])
      perm_stats[i] <- abs(mean(data[[var]][permuted_group == "Case"], na.rm = TRUE) - 
                           mean(data[[var]][permuted_group == "Control"], na.rm = TRUE))
    }
    
    # Corrected permutation p-value (never exactly 0)
    p_value <- (1 + sum(perm_stats >= observed_stat)) / (1 + num_permutations)
    
    return(list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value))
    
  } else {
    warning(paste("Skipping variable", var, "due to insufficient group size"))
    return(NULL)
  }
}

# Initialize a dataframe to store p-values for each variable
p_value_results <- data.frame(Variable = character(), Observed_Stat = numeric(), p_value = numeric(), stringsAsFactors = FALSE)

# Loop through each variable and perform the permutation test.
# (Growing p_value_results with rbind is quadratic in general, but is fine
# for the 14 variables analyzed here.)
for (var in coding_variables_to_analyze) {
  # Run the permutation test
  test_result <- permutation_test(categorized_coding_stats_discovery, var, "Category_Group")
  
  if (!is.null(test_result)) {
    # Extract the permuted statistics, observed statistic, and p-value
    perm_stats <- test_result$perm_stats
    observed_stat <- test_result$observed_stat
    p_value <- test_result$p_value
    
    # Store the p-value and observed statistic in the dataframe
    p_value_results <- rbind(p_value_results, data.frame(Variable = var, 
                                                         Observed_Stat = observed_stat, 
                                                         p_value = p_value))
    
    # Create a data frame for plotting the permutation distribution
    perm_df <- data.frame(perm_stats = perm_stats)
    
    # Plot the permutation distribution with the observed statistic marked
    # and the p-value in the subtitle.
    # NOTE(review): `size` for line geoms is deprecated in ggplot2 >= 3.4;
    # consider `linewidth = 1` in geom_vline below.
    p <- ggplot(perm_df, aes(x = perm_stats)) +
      geom_histogram(bins = 30, fill = "lightblue", color = "black") +
      geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", size = 1) +
      labs(title = paste("Permutation Test for", var),
           x = "Permuted Statistic",
           y = "Frequency",
           subtitle = paste("Observed Statistic =", round(observed_stat, 4), 
                            "| P-value =", format(p_value, digits = 4))) +  # Add p-value to subtitle
      theme_minimal()
    
    # Print the plot
    print(p)
  }
}

# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
##                         Variable Observed_Stat p_value
## 1            Cardiomyopathy_HIGH    0.67487504  0.0004
## 2      Cardiomyopathy_start_lost    0.03608007  0.0001
## 3     Cardiomyopathy_stop_gained    0.27295893  0.0000
## 4        Cardiomyopathy_01_00001    2.19304371  0.0000
## 5           Cardiomyopathy_00001    0.55399147  0.0015
## 6  Cardiomyopathy_total_missense    8.02403705  0.0000
## 7             PLP_Cardiomyopathy    0.19359502  0.0000
## 8                  Epilepsy_HIGH    0.70615260  0.0091
## 9            Epilepsy_start_lost    0.01129570  0.0507
## 10          Epilepsy_stop_gained    0.34350191  0.0000
## 11             Epilepsy_01_00001    1.45746349  0.0000
## 12                Epilepsy_00001    0.56937910  0.0037
## 13       Epilepsy_total_missense    2.48950064  0.0002
## 14                  PLP_Epilepsy    0.01211653  0.5191

Create the Function to plot each column as a ratio to the means from Group 1

# Summarise one numeric column relative to the Control-group mean.
#
# For each Category_Group, computes the mean, SD, and n of `col_name`, then
# the ratio of each group mean to the Control mean, the SEM on that same
# ratio scale, and a 95% confidence interval on the ratio.
#
# @param data     Data frame with a Category_Group column containing "Control".
# @param col_name String; name of the numeric column to summarise.
# @return list(summary_data = per-group summary data frame) -- wrapped in a
#         list for backward compatibility with callers using $summary_data.
plot_column_ratio <- function(data, col_name) {
  # Control-group mean: denominator for every ratio below.
  # [[-extraction keeps this tibble-safe (on a tibble, data[rows, col_name]
  # returns a one-column tibble, which would break mean()).
  # (The original also computed the Control SD here, but never used it.)
  group1_mean <- mean(data[[col_name]][data$Category_Group == "Control"], na.rm = TRUE)
  
  # Per-group summary scaled to the Control mean
  data_summary <- data %>%
    group_by(Category_Group) %>%
    summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
              sd_value = sd(.data[[col_name]], na.rm = TRUE),
              n = n()) %>%
    mutate(ratio = mean_value / group1_mean,
           sem_value = sd_value / sqrt(n),
           sem_ratio = sem_value / group1_mean,   # SEM scaled to Control, like the mean
           ci_lower = ratio - (1.96 * sem_ratio), # Lower bound of the 95% CI
           ci_upper = ratio + (1.96 * sem_ratio)) # Upper bound of the 95% CI
  
  return(list(summary_data = data_summary))
}

Plot the data relative to group 1

# Accumulator for per-column summaries. Initialised truly empty: the original
# seeded it with placeholder columns (mean/std) that the loop never fills,
# which left all-NA mean/std columns in the combined result after bind_rows().
combined_coding_stats_summary_df <- data.frame()


# List of all columns to plot (everything except identifier/grouping columns)
columns_to_plot <- setdiff(names(categorized_coding_stats_discovery), c("ID", "Category", "Category_Group"))

# Loop through each column and summarise it relative to Category_Group "Control"
for (col in columns_to_plot) {
  plot_data <- plot_column_ratio(categorized_coding_stats_discovery, col)
  # Append this column's summary, tagged with its column name
  combined_coding_stats_summary_df <- bind_rows(combined_coding_stats_summary_df, mutate(plot_data$summary_data, col_name = col))
}

# Variants to display, listed bottom-to-top in the desired y-axis order.
# One vector drives both the subset and the factor levels (the original
# repeated the same 14 names in two separate literals).
levels_order <- c(
                  "PLP_Epilepsy",
                  "Epilepsy_00001",
                  "Epilepsy_01_00001",
                  "Epilepsy_total_missense",
                  "Epilepsy_start_lost",
                  "Epilepsy_stop_gained",
                  "Epilepsy_HIGH",
                  "PLP_Cardiomyopathy",
                  "Cardiomyopathy_00001",
                  "Cardiomyopathy_01_00001",
                  "Cardiomyopathy_total_missense",
                  "Cardiomyopathy_start_lost",
                  "Cardiomyopathy_stop_gained",
                  "Cardiomyopathy_HIGH"
)

# Keep only the selected variants, dropping the Control reference group
selected_coding_data <- combined_coding_stats_summary_df %>%
  filter(col_name %in% levels_order,
         Category_Group != "Control")

# Fix the plotting order of the variants
selected_coding_data$col_name <- factor(selected_coding_data$col_name, levels = levels_order)

# Forest-style plot: each variant's burden ratio vs the Control mean (+/- SEM).
# A single dodge object is shared by points and error bars so they stay
# aligned (the original dodged points by width 1 but error bars by 0.5,
# which misaligns bars and points whenever more than one group is drawn);
# the errorbar cap width is also reduced from 1 to a conventional 0.25.
dodge <- position_dodge(width = 0.5)

coding_stats_plot <- ggplot(selected_coding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
  geom_point(position = dodge, size = 3) +
  geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = dodge, width = 0.25) +
  geom_vline(xintercept = 1, linetype = "dashed") +  # ratio of 1 == Control mean
  scale_color_manual(values = "#3C8C78") +
  labs(title = "Ratio Compared to Control +/-SEM",
       y = "Variant",
       x = "Ratio to Control Mean",
       color = "Category Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8), 
        axis.text.y = element_text(size = 8, hjust = 1),
        axis.title.x = element_text(size = 8), 
        axis.title.y = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(size = 8, hjust = 0.5),
        plot.subtitle = element_text(size = 8),
        plot.caption = element_text(size = 8),
        plot.margin = margin(15, 15, 15, 15)) +
  scale_x_continuous(limits = c(-2, 12))


print(coding_stats_plot)

# Print the table of p-values
# (same table as printed above; repeated here alongside the plot)
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
##                         Variable Observed_Stat p_value
## 1            Cardiomyopathy_HIGH    0.67487504  0.0004
## 2      Cardiomyopathy_start_lost    0.03608007  0.0001
## 3     Cardiomyopathy_stop_gained    0.27295893  0.0000
## 4        Cardiomyopathy_01_00001    2.19304371  0.0000
## 5           Cardiomyopathy_00001    0.55399147  0.0015
## 6  Cardiomyopathy_total_missense    8.02403705  0.0000
## 7             PLP_Cardiomyopathy    0.19359502  0.0000
## 8                  Epilepsy_HIGH    0.70615260  0.0091
## 9            Epilepsy_start_lost    0.01129570  0.0507
## 10          Epilepsy_stop_gained    0.34350191  0.0000
## 11             Epilepsy_01_00001    1.45746349  0.0000
## 12                Epilepsy_00001    0.56937910  0.0037
## 13       Epilepsy_total_missense    2.48950064  0.0002
## 14                  PLP_Epilepsy    0.01211653  0.5191

Input the data

# Pull in the gene data: per-individual, per-gene variant counts (HIGH, PLP)
gene_data <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_variants_by_gene_for_manuscript_fixed_NEW.csv")

# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript_fixed.txt", sep = "\t", header = TRUE)

# add the arrest category to each row by matching sample IDs
# (match() yields NA Category for IDs absent from the cohort table)
gene_data$Category <- cohort$arrest_ontology[match(gene_data$ID, cohort$CGM_id)]

# Restrict to the Discovery set for the downstream analyses
gene_data_discovery <- gene_data %>%
  filter(Set == "Discovery")

Summarize the data by GENE, counting the number of variants for each gene. First, filter for Category > 1 (cases), then group and summarize.

# Case (Category 2-6) totals: HIGH-impact and P/LP variant counts per gene
variants_per_gene_Cat_2_6 <- gene_data_discovery %>%
  filter(Category > 1) %>%
  group_by(GENE) %>%
  summarise(
    HIGH = sum(HIGH, na.rm = TRUE),
    PLP = sum(PLP, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Cat_2_6)
## # A tibble: 356 × 3
##    GENE    HIGH   PLP
##    <chr>  <int> <int>
##  1 A2ML1     57     0
##  2 AARS       6     0
##  3 ABAT       0     0
##  4 ABCC9      4     0
##  5 ACADVL     4     0
##  6 ACTC1      1     0
##  7 ACTL6B     0     0
##  8 ACTN2      7     0
##  9 ADAM22     3     0
## 10 ADSL       1     1
## # ℹ 346 more rows

Filter for Category == 1 and then group and summarize

# Control (Category 1) totals: HIGH-impact and P/LP variant counts per gene
variants_per_gene_Cat_1 <- gene_data_discovery %>%
  filter(Category == 1) %>%
  group_by(GENE) %>%
  summarise(
    HIGH = sum(HIGH, na.rm = TRUE),
    PLP = sum(PLP, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Cat_1)
## # A tibble: 341 × 3
##    GENE    HIGH   PLP
##    <chr>  <int> <int>
##  1 A2ML1     69     0
##  2 AARS       0     0
##  3 ABAT       0     0
##  4 ABCC9      3     0
##  5 ACADVL     6     3
##  6 ACTL6B     0     2
##  7 ACTN2      1     0
##  8 ADAM22     0     0
##  9 ADSL       0     0
## 10 AGL        0     0
## # ℹ 331 more rows

Load gene lists from text files

genes_CMAR1 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR1_list.txt")
genes_CMAR2 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR2_list.txt")
genes_EIEE_OMIM <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_EIEE_OMIM_list.txt")
genes_Epilepsy <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_Epilepsy_list.txt")

Annotate gene with source panel

# Tag each panel gene with its source phenotype
genes_CMAR1 <- data.frame(Gene = genes_CMAR1, Source = "Cardiomyopathy")
genes_CMAR2 <- data.frame(Gene = genes_CMAR2, Source = "Cardiomyopathy")
genes_EIEE_OMIM <- data.frame(Gene = genes_EIEE_OMIM, Source = "Epilepsy")
genes_Epilepsy <- data.frame(Gene = genes_Epilepsy, Source = "Epilepsy")

# Replace "\"AARS1\"" with "AARS"
genes_EIEE_OMIM$Gene <- ifelse(genes_EIEE_OMIM$Gene == "\"AARS1\"", "AARS", genes_EIEE_OMIM$Gene)

# Replace "\"NEXMIF\"" with "KIAA2022"
# NOTE(review): the original comment stated the opposite direction; the code
# maps NEXMIF -> KIAA2022, presumably to match the symbol used in gene_data
# -- confirm which symbol the variant tables actually carry
genes_Epilepsy$Gene <- ifelse(genes_Epilepsy$Gene == "\"NEXMIF\"", "KIAA2022", genes_Epilepsy$Gene)

Append panel source to the gene_data

# Combine all panel lists into one dataframe
all_genes <- rbind(genes_CMAR1, genes_CMAR2, genes_EIEE_OMIM, genes_Epilepsy)

# Sort by Gene then Source; for genes on both panels "Cardiomyopathy" sorts
# before "Epilepsy" alphabetically, so it wins the de-duplication below
all_genes <- all_genes[order(all_genes$Gene, all_genes$Source), ]

# Remove duplicates, keeping the first occurrence
all_genes <- all_genes[!duplicated(all_genes$Gene), ]

# Clean the 'Gene' column in 'all_genes' by removing quotes and backslashes
all_genes$Gene <- gsub('\"', '', all_genes$Gene)

# Ensure that the 'GENE' column in 'gene_data_discovery' and 'Gene' in 'all_genes' are character
gene_data_discovery$GENE <- as.character(gene_data_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)

# find the index of each gene in 'all_genes' and use this index to assign the corresponding 'Source' to a new column in 'gene_data'
# (genes not found in any panel get NA Source)
gene_data_discovery$Source <- all_genes$Source[match(gene_data_discovery$GENE, all_genes$Gene)]

# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'

# View the first few rows to verify the new 'Source' has been added
head(gene_data_discovery)
##           ID    GENE HIGH PLP       Set Category         Source
## 1 CGM0000001   A2ML1    0   0 Discovery        1 Cardiomyopathy
## 2 CGM0000001    ABAT    0   0 Discovery        1       Epilepsy
## 3 CGM0000001  ADAM22    0   0 Discovery        1       Epilepsy
## 4 CGM0000001   AKAP9    0   0 Discovery        1 Cardiomyopathy
## 5 CGM0000001 ALDH5A1    0   0 Discovery        1       Epilepsy
## 6 CGM0000001   ALMS1    0   0 Discovery        1 Cardiomyopathy

Add the source to the category 1 variants list

# Annotate the Category 1 (control) per-gene counts with their panel source
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_1)
## # A tibble: 6 × 4
##   GENE    HIGH   PLP Source        
##   <chr>  <int> <int> <chr>         
## 1 A2ML1     69     0 Cardiomyopathy
## 2 AARS       0     0 Epilepsy      
## 3 ABAT       0     0 Epilepsy      
## 4 ABCC9      3     0 Cardiomyopathy
## 5 ACADVL     6     3 Cardiomyopathy
## 6 ACTL6B     0     2 Epilepsy

Add the source to the category 2-6 variants list

# Annotate the Category 2-6 (case) per-gene counts with their panel source
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_2_6)
## # A tibble: 6 × 4
##   GENE    HIGH   PLP Source        
##   <chr>  <int> <int> <chr>         
## 1 A2ML1     57     0 Cardiomyopathy
## 2 AARS       6     0 Epilepsy      
## 3 ABAT       0     0 Epilepsy      
## 4 ABCC9      4     0 Cardiomyopathy
## 5 ACADVL     4     0 Cardiomyopathy
## 6 ACTC1      1     0 Cardiomyopathy

Combine the control and case variant tables into a single long dataset

# Add a new column to indicate the category ("1" = controls)
variants_per_gene_Cat_1 <- variants_per_gene_Cat_1 %>%
  mutate(Category = "1")  

# Add a new column to indicate the category range ("2-6" = cases)
variants_per_gene_Cat_2_6 <- variants_per_gene_Cat_2_6 %>%
  mutate(Category = "2-6")  

# Combine the two datasets (stacked long: one row per gene per category)
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)

# Print the combined dataset
print(combined_variants)
## # A tibble: 697 × 5
##    GENE    HIGH   PLP Source         Category
##    <chr>  <int> <int> <chr>          <chr>   
##  1 A2ML1     69     0 Cardiomyopathy 1       
##  2 AARS       0     0 Epilepsy       1       
##  3 ABAT       0     0 Epilepsy       1       
##  4 ABCC9      3     0 Cardiomyopathy 1       
##  5 ACADVL     6     3 Cardiomyopathy 1       
##  6 ACTL6B     0     2 Epilepsy       1       
##  7 ACTN2      1     0 Cardiomyopathy 1       
##  8 ADAM22     0     0 Epilepsy       1       
##  9 ADSL       0     0 Epilepsy       1       
## 10 AGL        0     0 Cardiomyopathy 1       
## # ℹ 687 more rows

Plot the number of variant types in cases and controls

# Filter dataset where High > 0 (keep only genes with at least one HIGH-impact variant)
combined_variants_High <- combined_variants %>% filter(HIGH > 0)

# Axis-label helper: for the "Category" variable, map "1" -> "Control" and
# anything else -> "Case"; every other variable's values pass through as-is.
custom_labeller <- function(variable, value) {
  if (variable != "Category") {
    return(value)
  }
  ifelse(value == "1", "Control", "Case")
}

# Create the plot with custom labeller: heatmap of HIGH-impact counts per
# gene (x) and case/control group (y), faceted by source panel; y labels are
# rendered via custom_labeller ("1" -> Control, "2-6" -> Case)
High_variant_plot <- ggplot(combined_variants_High, aes(x = GENE, y = Category, fill = HIGH)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "High Variants Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside") +
  scale_y_discrete(labels = function(x) custom_labeller("Category", x))

# Print the plot
print(High_variant_plot)

Plot the number of P/LP variants per gene in cases and controls

# Label helper for the PLP heatmap axis: translates Category values
# ("1" -> "Control", otherwise "Case"); other variables are returned untouched.
custom_labeller_plp <- function(variable, value) {
  if (identical(variable, "Category")) {
    return(ifelse(value == "1", "Control", "Case"))
  }
  value
}

# Filter datasets where PLP > 0 (keep only genes with at least one P/LP variant)
combined_variants_PLP <- combined_variants %>% filter(PLP > 0)

# Create the PLP plot with custom labeller: heatmap of P/LP counts per gene
# (x) and case/control group (y), faceted by source panel
PLP_plot <- ggplot(combined_variants_PLP, aes(x = GENE, y = Category, fill = PLP)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "PLP Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside") +
  scale_y_discrete(labels = function(x) custom_labeller_plp("Category", x))

# Print the plot
print(PLP_plot)

Now bring in the module data

# Expert-curated cardiomyopathy gene sets; joined below on columns Gene/Module
modules <- read_excel("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/cardio_gene_sets.xlsx")

annotate the gene_data with the modules

# Annotate each per-gene row with its module (genes in no module get NA Module)
module_data <- left_join(gene_data_discovery, modules, by = c("GENE" = "Gene"))

# View the first few rows of the updated data
head(module_data)
##           ID    GENE HIGH PLP       Set Category         Source
## 1 CGM0000001   A2ML1    0   0 Discovery        1 Cardiomyopathy
## 2 CGM0000001    ABAT    0   0 Discovery        1       Epilepsy
## 3 CGM0000001  ADAM22    0   0 Discovery        1       Epilepsy
## 4 CGM0000001   AKAP9    0   0 Discovery        1 Cardiomyopathy
## 5 CGM0000001 ALDH5A1    0   0 Discovery        1       Epilepsy
## 6 CGM0000001   ALMS1    0   0 Discovery        1 Cardiomyopathy
##               Module
## 1          Rasopathy
## 2               <NA>
## 3               <NA>
## 4  Membrane_polarity
## 5               <NA>
## 6 fate_specification

Now plot the PLPs by their expert-defined modules. The size is the absolute number of P/LP variants in the category; the color is the number of variants relative to the number of genes in the module.

#Filter NAs and aggregate the data (drop genes with no module assignment)
module_data <- module_data %>%
  filter(!is.na(Module))

# Case/control grouping label used throughout the module analyses
module_data <- module_data %>%
  mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))

# Count up the P/LP variants per module and case/control group
module_summary <- module_data %>%
  group_by(Module, Category_Group) %>%
  summarise(Total_PLP_variant = sum(PLP, na.rm = TRUE)) %>%
  ungroup() 
## `summarise()` has grouped output by 'Module'. You can override using the
## `.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
  group_by(Module) %>%
  summarise(Num_Genes = n()) %>%
  ungroup()

# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
  left_join(genes_per_module, by = c("Module" = "Module"))

# Calculate the size relative to the number of genes per module
module_summary <- module_summary %>%
  mutate(Relative_Size = Total_PLP_variant / Num_Genes)

# Filter the data to only include "Categories 2-6"
module_summary_filtered <- module_summary %>%
  filter(Category_Group == "Categories 2-6")

# Order modules by descending P/LP count for the plot's x axis
module_order <- module_summary_filtered %>%
  group_by(Module) %>%
  summarise(Sort_Metric = mean(Total_PLP_variant, na.rm = TRUE)) %>%
  arrange(desc(Sort_Metric)) %>%
  .$Module

module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)

# Now plot with the reordered Module: square size = absolute P/LP count,
# color = P/LP count scaled by the module's gene count
modules_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_PLP_variant, color = Relative_Size)) +
  geom_point(shape = 15, alpha = 1) +
  scale_size(range = c(3, 20), name = "Number of PLPs") + 
  scale_color_gradient(low = "Grey", high = "#05618F", name = "# of PLPs relative to Module size") +
  theme_minimal() +
  labs(title = "Total PLP by Module (Cases)",
       x = "Module",
       y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(modules_plot)

Build the interaction plot data

#Count the High effect variants per person, module, and category
aggregated_data <- module_data %>%
  group_by(ID, Module, Category) %>%
  summarise(High_count = sum(HIGH),
            .groups = 'drop')

#split off controls (Category 1)
data_cat_1 <- aggregated_data %>%
  filter(Category %in% c(1))

#identify IDs with any 'High_count' greater than 0
ids_with_both_1 <- data_cat_1 %>%
  group_by(ID) %>%
  summarise(High_count_any = any(High_count > 0)) %>%
  filter(High_count_any) %>%
  dplyr::select(ID)

# Filter to include only rows that have IDs with High_count > 0
modules_with_both_1 <- data_cat_1 %>%
  semi_join(ids_with_both_1, by = "ID")

#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
# NOTE(review): combn(x, 2) errors when an ID has only one distinct Module;
# this assumes every retained ID spans >= 2 modules -- confirm
# NOTE(review): the final filter keeps pairs where EITHER member has a HIGH
# variant (|), not both -- confirm that is the intended pair definition
module_pairs_for_ids_1 <- modules_with_both_1 %>%
  group_by(ID) %>%
  do({
    data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
  }) %>%
  dplyr::rename(Module1 = X1, Module2 = X2) %>%
  left_join(modules_with_both_1, by = c("ID", "Module1" = "Module")) %>%
  left_join(modules_with_both_1, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
  filter(High_count_2 > 0 | High_count_1 > 0)

# Count occurrences of module pairs across IDs (one count per distinct person)
module_pair_counts_1 <- module_pairs_for_ids_1 %>%
  group_by(Module1, Module2) %>%
  summarise(Count = n_distinct(ID), .groups = 'drop')



#split off cases (Categories 2-6)
data_cat_2_6 <- aggregated_data %>%
  filter(Category %in% c(2,3,4,5,6))

#identify IDs with any 'High_count' greater than 0
ids_with_both_2_6 <- data_cat_2_6 %>%
  group_by(ID) %>%
  summarise(Missense_any = any(High_count > 0)) %>%
  filter(Missense_any) %>%
  dplyr::select(ID)

# Filter to include only rows that have IDs with High_count > 0
modules_with_both_2_6 <- data_cat_2_6 %>%
  semi_join(ids_with_both_2_6, by = "ID")


#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
# (same construction as for the controls above; the same combn >= 2-module
# assumption applies here)
module_pairs_for_ids_2_6 <- modules_with_both_2_6 %>%
  group_by(ID) %>%
  do({
    data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
  }) %>%
  dplyr::rename(Module1 = X1, Module2 = X2) %>%
  left_join(modules_with_both_2_6, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_2_6, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
  filter(High_count_2 > 0 | High_count_1 > 0)

# Count occurrences of module pairs across IDs
module_pair_counts_2_6 <- module_pairs_for_ids_2_6 %>%
  group_by(Module1, Module2) %>%
  summarise(Count = n_distinct(ID), .groups = 'drop')


# Ensure all combinations are present with at least a zero count for each category
comparison_df <- merge(module_pair_counts_1, module_pair_counts_2_6, by = c("Module1", "Module2"), all = TRUE)
comparison_df[is.na(comparison_df)] <- 0  # Replace NA with 0

# Rename columns (merge() suffixes the two Count columns .x/.y)
names(comparison_df)[names(comparison_df) == "Count.x"] <- "Count_1"
names(comparison_df)[names(comparison_df) == "Count.y"] <- "Count_2_6"

# Add number of people per group for totals not in counts
# NOTE(review): controls are counted with == "1" (character) while cases use
# %in% c(2,3,4,5,6) (numeric); both rely on coercion of arrest_ontology -- confirm
comparison_df$Not_Count_1 <- sum(cohort$arrest_ontology == "1") - comparison_df$Count_1
comparison_df$Not_Count_2_6 <- sum(cohort$arrest_ontology %in% c(2,3,4,5,6)) - comparison_df$Count_2_6

# Function to perform Fisher's Exact Test for a row
# Two-sided Fisher's exact test on a 2x2 contingency table of module-pair
# membership (in/not-in the pair) across the two cohorts.
# Returns only the p-value.
perform_fisher_test <- function(count_1, not_count_1, count_2_6, not_count_2_6) {
  tbl <- matrix(
    c(count_1, not_count_1, count_2_6, not_count_2_6),
    nrow = 2,
    dimnames = list(c("In Count", "Not in Count"), c("Group_1", "Group_2_6"))
  )
  fisher.test(tbl)$p.value
}

# Apply the Fisher function to each row in the dataframe
comparison_df$p_value <- mapply(perform_fisher_test, 
                                comparison_df$Count_1, 
                                comparison_df$Not_Count_1, 
                                comparison_df$Count_2_6, 
                                comparison_df$Not_Count_2_6)



# Bonferroni correction across all tested module pairs
comparison_df$adjusted_p_value <- p.adjust(comparison_df$p_value, method = "bonferroni", n = length(comparison_df$p_value))

# Filter significant results based on adjusted p-values
significant_pairs <- comparison_df %>% filter(adjusted_p_value < 0.05)

# Print significant module pairs
print(significant_pairs)
##               Module1            Module2 Count_1 Count_2_6 Not_Count_1
## 1                CICR          sarcomere      68       104         528
## 2                 DGC fate_specification       5        28         591
## 3                 DGC                ICD      23        51         573
## 4                 DGC  Membrane_polarity      14        41         582
## 5                 DGC       Proteostasis       4        24         592
## 6                 DGC          sarcomere      27        68         569
## 7  fate_specification       mitochondria      15        37         581
## 8  fate_specification   nuclear_envelope       2        17         594
## 9  fate_specification          sarcomere      28        73         568
## 10                ICD fate_specification      24        57         572
## 11                ICD  Membrane_polarity      33        68         563
## 12                ICD       Proteostasis      23        55         573
## 13                ICD          sarcomere      45        95         551
## 14  lysosomal_storage          sarcomere      25        61         571
## 15  Membrane_polarity fate_specification      15        46         581
## 16  Membrane_polarity  lysosomal_storage      12        35         584
## 17  Membrane_polarity       mitochondria      24        50         572
## 18  Membrane_polarity       Proteostasis      14        43         582
## 19  Membrane_polarity          sarcomere      37        83         559
## 20       mitochondria          sarcomere      37        77         559
## 21       Proteostasis fate_specification       5        31         591
## 22       Proteostasis          sarcomere      26        72         570
## 23          Rasopathy          sarcomere      92       127         504
##    Not_Count_2_6      p_value adjusted_p_value
## 1            419 9.184664e-05     8.358044e-03
## 2            495 7.980239e-06     7.262018e-04
## 3            472 9.256654e-05     8.423555e-03
## 4            482 2.248122e-05     2.045791e-03
## 5            499 2.376165e-05     2.162310e-03
## 6            455 4.739262e-07     4.312729e-05
## 7            486 3.201544e-04     2.913405e-02
## 8            506 2.162122e-04     1.967531e-02
## 9            450 6.060785e-08     5.515314e-06
## 10           466 1.366726e-05     1.243721e-03
## 11           455 1.500246e-05     1.365223e-03
## 12           468 1.660428e-05     1.510989e-03
## 13           428 1.046357e-07     9.521851e-06
## 14           462 2.895091e-06     2.634532e-04
## 15           477 4.241213e-06     3.859504e-04
## 16           488 1.330254e-04     1.210531e-02
## 17           473 2.536897e-04     2.308576e-02
## 18           480 8.210955e-06     7.471969e-04
## 19           440 1.988805e-07     1.809813e-05
## 20           446 2.651718e-06     2.413064e-04
## 21           492 1.281642e-06     1.166294e-04
## 22           451 3.284151e-08     2.988577e-06
## 23           396 2.121104e-04     1.930205e-02

Make the plot for module makeup

#merge the high effect count data 
High_data <- rbind(data_cat_1, data_cat_2_6)

#assign the case/control categories ("1" = control, "2-6" = case)
High_data <- High_data %>%
  mutate(Category_group = case_when(
    Category == 1 ~ "1",
    TRUE ~ "2-6"
  ))

#assign in the module gene count data
# NOTE(review): these counts are hand-entered and the same table is repeated
# before the network plot below -- keep the two copies in sync, or derive the
# counts from `modules` instead
df_gene_counts <- data.frame(
  Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis", 
             "Rasopathy", "SNS_PNS", "Z_disc", "cytokine", 
             "fate_specification", "lysosomal_storage", "mitochondria", 
             "nuclear_envelope", "sarcomere"),
  Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)

# scale the high effect data per module based on the genes in the module
High_data_scaled <- High_data %>%
  left_join(df_gene_counts, by = "Module")

High_data_scaled <- High_data_scaled %>%
  mutate(High_count_per_gene = High_count / Num_Genes)

#compute the means and SEMs per module and case/control group
averages_sem_scaled <- High_data_scaled %>%
  dplyr::group_by(Module, Category_group) %>%
  dplyr::summarize(
    Mean = mean(High_count_per_gene, na.rm = TRUE),
    SD = sd(High_count_per_gene, na.rm = TRUE),
    N = n(),
    SEM = SD / sqrt(N),
    .groups = 'drop'
  )

#run the t-tests comparing case vs control (Category_group) within each module
test_results <- High_data_scaled %>%
  group_by(Module) %>%
  do({
    ttest <- t.test(High_count_per_gene ~ Category_group, data = .)
    tidy(ttest)
  }) %>%
  ungroup()  

# show t-test data
print(test_results)
## # A tibble: 14 × 11
##    Module      estimate estimate1 estimate2 statistic p.value parameter conf.low
##    <chr>          <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>
##  1 CICR        -6.86e-3  0.0117     0.0185     -1.83  6.77e-2      712. -0.0142 
##  2 DGC         -3.59e-3  0.000798   0.00439    -2.49  1.31e-2      637. -0.00642
##  3 ICD         -7.94e-3  0.00559    0.0135     -2.34  1.96e-2      702. -0.0146 
##  4 Membrane_p… -6.76e-3  0.00101    0.00777    -3.95  8.96e-5      483. -0.0101 
##  5 Proteostas… -4.79e-3  0.000532   0.00533    -3.01  2.75e-3      509. -0.00792
##  6 Rasopathy   -3.10e-3  0.00800    0.0111     -2.06  3.93e-2      897. -0.00604
##  7 SNS_PNS     -2.33e-3  0.00463    0.00696    -1.04  3.00e-1      621. -0.00673
##  8 Z_disc       1.02e-2  0.0580     0.0478      1.39  1.65e-1      783. -0.00419
##  9 cytokine     2.88e-2  0.115      0.0866      3.09  2.06e-3      984.  0.0105 
## 10 fate_speci… -6.87e-3  0.000508   0.00738    -3.41  7.18e-4      475. -0.0108 
## 11 lysosomal_… -2.17e-4  0.00214    0.00236    -0.118 9.06e-1      871. -0.00382
## 12 mitochondr… -1.84e-3  0.000968   0.00281    -2.37  1.83e-2      596. -0.00337
## 13 nuclear_en… -3.13e-3  0          0.00313    -2.01  4.53e-2      318  -0.00620
## 14 sarcomere   -1.99e-2  0.00314    0.0230     -4.59  5.60e-6      475. -0.0284 
## # ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
#calculate averages
# NOTE(review): this recomputes averages_sem_scaled, which was already built
# above, but WITHOUT na.rm = TRUE in mean()/sd(). If High_count_per_gene
# contains NAs (e.g. modules missing from df_gene_counts) the two versions
# differ and this one -- which the plot uses -- would produce NA bars.
# Confirm which is intended; otherwise this block is redundant.
averages_sem_scaled <- High_data_scaled %>%
  dplyr::group_by(Module, Category_group) %>%
  dplyr::summarize(
    Mean = mean(High_count_per_gene),
    SD = sd(High_count_per_gene),
    N = n(),
    SEM = SD / sqrt(N),
    .groups = 'drop'
  )

# label function for the fill legend ("1" -> Control, "2-6" -> Case)
custom_fill_labels <- c("1" = "Control", "2-6" = "Case")

# Grouped bar plot: mean HIGH-impact count per gene by module, +/- SEM,
# dodged by case/control group
High_modules_plot_scaled <- ggplot(averages_sem_scaled, aes(x = Module, y = Mean, fill = Category_group, group = Category_group)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8), color = "black") +
  geom_errorbar(aes(ymin = Mean - SEM, ymax = Mean + SEM), width = .2, position = position_dodge(.8)) +
  scale_fill_manual(values = c("1" = "#FF6464", "2-6" = "#05618F"), labels = custom_fill_labels) +
  labs(x = "Module", y = "Average High Count Per Gene") +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Show plot
print(High_modules_plot_scaled)

Plot the interaction network

#compute the relative counts based on cohort size (case rate / control rate)
comparison_df <- comparison_df %>%
  mutate(Relative_Counts = (Count_2_6 / (sum(cohort$arrest_ontology %in% c(2,3,4,5,6))))/(Count_1 / (sum(cohort$arrest_ontology == "1"))))

#assign in the module gene count data
# (duplicate of the df_gene_counts table defined for the bar plot above)
df_gene_counts <- data.frame(
  Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis", 
             "Rasopathy", "SNS_PNS", "Z_disc", "cytokine", 
             "fate_specification", "lysosomal_storage", "mitochondria", 
             "nuclear_envelope", "sarcomere"),
  Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
# Keep only Bonferroni-significant module pairs as network edges
significant_edges_df <- comparison_df %>%
  filter(adjusted_p_value < 0.05)

# Create an igraph object with edge attributes using filtered dataframe
network_graph <- graph_from_data_frame(d = significant_edges_df[, c("Module1", "Module2")], directed = FALSE)

# Matching Num_Genes from df_gene_counts to vertices in the network_graph
V(network_graph)$Num_Genes <- df_gene_counts$Num_Genes[match(V(network_graph)$name, df_gene_counts$Module)]

# Add edge attributes for adjusted_p_value and Relative_Counts from the filtered data frame
# NOTE(review): positional assignment assumes E() preserves the row order of
# significant_edges_df -- presumably true for graph_from_data_frame, but keep
# these two statements tied to the same filtered frame
E(network_graph)$adjusted_p_value <- significant_edges_df$adjusted_p_value
E(network_graph)$Relative_Counts <- significant_edges_df$Relative_Counts


# Plot the graph with ggraph: edge color = -log10 adjusted p-value,
# edge width = relative case/control co-occurrence, node size = module gene count
HIGH_network <- ggraph(network_graph, layout = 'fr') +
  geom_edge_link(aes(color = -log10(adjusted_p_value), edge_width = Relative_Counts), alpha = 0.8) +
  geom_node_point(aes(size = Num_Genes), color = "Black") +
  geom_node_text(aes(label = name), repel = TRUE, point.padding = unit(0.01, "lines")) +
  scale_size_continuous(range = c(3, 10)) +
  theme_graph()

print(HIGH_network)
## Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `transform` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Make the box plot for the module abundance

#Filter NAs and aggregate the data
# (re-runs the same filter/mutate as the PLP section; idempotent on module_data)
module_data <- module_data %>%
  filter(!is.na(Module))

module_data <- module_data %>%
  mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))

#count up the HIGH-impact variants per module and case/control group
# (note: this overwrites the PLP-based module_summary created earlier)
module_summary <- module_data %>%
  group_by(Module, Category_Group) %>%
  summarise(Total_HIGH_variant = sum(HIGH, na.rm = TRUE)) %>%
  ungroup() 
## `summarise()` has grouped output by 'Module'. You can override using the
## `.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
  group_by(Module) %>%
  summarise(Num_Genes = n()) %>%
  ungroup()

# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
  left_join(genes_per_module, by = c("Module" = "Module"))

# Calculate the size relative to the number of genes per module
# (HIGH variants per gene, so large modules do not dominate the colour scale)
module_summary <- module_summary %>%
  mutate(Relative_Size = Total_HIGH_variant / Num_Genes)

# Filter the data to only include "Categories 2-6"
module_summary_filtered <- module_summary %>%
  filter(Category_Group == "Categories 2-6")

#order them by mean count
# Modules are ordered by descending mean HIGH count so the plot reads
# largest-to-smallest along the x axis
module_order <- module_summary_filtered %>%
  group_by(Module) %>%
  summarise(Sort_Metric = mean(Total_HIGH_variant, na.rm = TRUE)) %>%
  arrange(desc(Sort_Metric)) %>%
  .$Module

module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)

# Now plot with the reordered Module
# Square markers (shape 15) sized by total HIGH count and coloured by the
# count relative to module size; cases only
modules_HIGH_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_HIGH_variant, color = Relative_Size)) +
  geom_point(shape = 15, alpha = 1) +
  scale_size(range = c(3, 20), name = "Number of HIGHs") + 
  scale_color_gradient(low = "Grey", high = "#05618F", name = "# of HIGHs relative to Module size") +
  theme_minimal() +
  labs(title = "Total HIGH by Module (Cases)",
       x = "Module",
       y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(modules_HIGH_plot)

Pull in uniprot data

# pull down data for MYH7 (uniprot ID P12883)
# NOTE: get_features() performs a network call to the UniProt API
MYH7_data <- drawProteins::get_features("P12883")
## [1] "Download has worked"
MYH7_data <- drawProteins::feature_to_dataframe(MYH7_data)

# pull down data for MYBPC3 (uniprot ID Q14896)
MYBPC3_data <- drawProteins::get_features("Q14896")
## [1] "Download has worked"
MYBPC3_data <- drawProteins::feature_to_dataframe(MYBPC3_data)

# pull down data for SCN5A (uniprot ID Q14524)
SCN5A_data <- drawProteins::get_features("Q14524")
## [1] "Download has worked"
SCN5A_data <- drawProteins::feature_to_dataframe(SCN5A_data)

Plot PLP variants

# Build the MYH7 protein schematic (chains, domains, regions)
MYH7_plot <- draw_canvas(MYH7_data)
MYH7_plot <- draw_chains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_domains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_regions(MYH7_plot, MYH7_data)
#MYH7_plot <- draw_phospho(MYH7_plot, MYH7_data)

# Variant positions (amino-acid coordinates) to overlay on the schematic.
# y is filled by recycling so it cannot drift out of sync with the number
# of positions (previously a hard-coded rep(1, 10)).
variants <- data.frame(
  begin = c(1712, 1057, 908, 904, 869, 741, 723, 576, 369, 355)
)
variants$y <- 1

# Now, add these points to your plot
MYH7_plot <- MYH7_plot + 
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)

# Adjust themes and aesthetics as needed
MYH7_plot <- MYH7_plot + theme_bw(base_size = 14) + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())

MYH7_plot

##################################################################################################
# Build the MYBPC3 protein schematic (chains, domains, regions)
MYBPC3_plot <- draw_canvas(MYBPC3_data)
MYBPC3_plot <- draw_chains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_domains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_regions(MYBPC3_plot, MYBPC3_data)
#MYBPC3_plot <- draw_phospho(MYBPC3_plot, MYBPC3_data)

#######Note, 7 variants are not shown because they are splice variants

# Variant positions (amino-acid coordinates); y is filled by recycling so
# it cannot drift out of sync with the number of positions.
variants <- data.frame(
  begin = c(1098, 1096, 1064, 791, 542, 502, 495, 258, 219)
)
variants$y <- 1

# Now, add these points to your plot
MYBPC3_plot <- MYBPC3_plot + 
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)

# Adjust themes and aesthetics as needed
MYBPC3_plot <- MYBPC3_plot + theme_bw(base_size = 14) + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())

MYBPC3_plot

##################################################################################################
# Build the SCN5A protein schematic (chains, domains, regions)
SCN5A_plot <- draw_canvas(SCN5A_data)
SCN5A_plot <- draw_chains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_domains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_regions(SCN5A_plot, SCN5A_data)
#SCN5A_plot <- draw_phospho(SCN5A_plot, SCN5A_data)

#######Note, 2 variants are not shown because they are splice variants
# Variant positions (amino-acid coordinates); y is filled by recycling so
# it cannot drift out of sync with the number of positions.
variants <- data.frame(
  begin = c(1784, 1595, 1319, 1316, 845, 828, 814)
)
variants$y <- 1

# Now, add these points to your plot
SCN5A_plot <- SCN5A_plot + 
  geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)

# Adjust themes as needed
SCN5A_plot <- SCN5A_plot + theme_bw(base_size = 14) + 
  theme(panel.grid.minor = element_blank(), 
        panel.grid.major = element_blank(),
        axis.ticks = element_blank(), 
        axis.text.y = element_blank(),
        panel.border = element_blank())


SCN5A_plot

Noncoding******************************************************************************************************************

Pull in the file that is just the ATAC data intervals overlaid on top of the Hi-C intervals which map to the genes of interest

# Read the PC-Hi-C ATAC overlay BED file
# Minimal 3-column BED (no header): chromosome, start, end
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/SFRN_cohort_data/noncoding/noncoding_rare/EP_ATAC_intersect_final_output_nodups.bed", col_names = FALSE, col_types = cols(
  X1 = col_character(), # Chromosome
  X2 = col_double(),    # Start Position
  X3 = col_double()     # End Position
))

Plot noncoding interval size histogram

# Calculate Interval Sizes (bp) for each interval
bed_data$interval_size <- bed_data$X3 - bed_data$X2

# Calculate the median of interval sizes
median_interval_size <- median(bed_data$interval_size, na.rm = TRUE)

# Histogram of interval sizes with the median marked.
# `size` for lines was deprecated in ggplot2 3.4.0; use `linewidth`.
interval_size_histogram <- ggplot(bed_data, aes(x = interval_size)) +
  geom_histogram(binwidth = 100, fill = "lightblue", color = "black") +
  geom_vline(aes(xintercept = median_interval_size), color = "black", linetype = "dashed", linewidth = 1) +
  xlab("Interval Size") +
  ylab("Frequency") +
  ggtitle("Histogram of Interval Sizes") +
  theme_cowplot(12)

interval_size_histogram

Report the central tendency

# Mean exceeds the median, consistent with a right tail of large intervals
mean(bed_data$interval_size)
## [1] 497.4085
median(bed_data$interval_size)
## [1] 396

Now pull in the ATAC data that maps to genes of interest, including the promoter fragment the ATAC data mapped to

# Read the BED file
# 7-column layout: ATAC interval coordinates plus the promoter fragment
# coordinates and the gene name(s) it contacts
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/EP_ATAC_intersect_final_output_annotated_unique.bed", col_names = c("Chromosome", "Start", "End", "Chromosome2", "GeneStart", "GeneEnd", "GeneNames"), col_types = cols(
  Chromosome = col_character(),
  Start = col_double(),
  End = col_double(),
  Chromosome2 = col_character(),
  GeneStart = col_double(),
  GeneEnd = col_double(),
  GeneNames = col_character()
))

# Split the gene names in case there are multiple genes separated by semicolon
# (unnest gives one row per region-gene pair)
bed_data <- bed_data %>% 
  mutate(GeneNames = strsplit(GeneNames, ";")) %>%
  unnest(GeneNames)

# Read the gene list
gene_list <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Lorenzo_panel_genes_simple.txt")

# Filter bed_data to keep only rows with genes in the gene list
bed_data <- bed_data %>%
  filter(GeneNames %in% gene_list)

# Count the number of regions mapped to each gene
gene_count <- bed_data %>%
  count(GeneNames)

print(gene_count)
## # A tibble: 347 × 2
##    GeneNames     n
##    <chr>     <int>
##  1 A2ML1        15
##  2 ABAT         45
##  3 ABCC9        35
##  4 ACADVL        3
##  5 ACTC1         9
##  6 ACTL6B       13
##  7 ACTN2        21
##  8 ADAM22        3
##  9 ADSL         35
## 10 AGL          59
## # ℹ 337 more rows

Plot histogram that is number of regions that map to the genes of interest

# Calculate the median number of regions per gene
median_regions <- median(gene_count$n, na.rm = TRUE)

# Histogram of regions-per-gene with the median marked.
# `size` for lines was deprecated in ggplot2 3.4.0; use `linewidth`.
regions_per_gene <- ggplot(gene_count, aes(x = n)) +
  geom_histogram(binwidth = 1, fill = "#B40F20", color = "#B40F20") +
  geom_vline(aes(xintercept = median_regions), color = "black", linetype = "dashed", linewidth = 1) +
  xlab("Number of Regions") +
  ylab("Frequency") +
  theme_cowplot(12) +
  ggtitle("Histogram of the Distribution of Number of Regions per Gene")

regions_per_gene

Report the number of regions per gene

# Central tendency of the number of regions mapped per gene
mean(gene_count$n)
## [1] 31.40922
median(gene_count$n)
## [1] 24

Report the region sizes

# Size (bp) of each individual region
bed_data$region_size <- bed_data$End - bed_data$Start

# Sum region sizes for each gene
total_region_size_per_gene <- bed_data %>%
  group_by(GeneNames) %>%
  summarise(TotalRegionSize = sum(region_size))

# Total candidate regulatory sequence space per gene
mean(total_region_size_per_gene$TotalRegionSize)
## [1] 16239.12
median(total_region_size_per_gene$TotalRegionSize)
## [1] 12010

Plot total sequence space per gene

# Create histogram of Total Region Sizes
# (plot is printed directly, not assigned)
ggplot(total_region_size_per_gene, aes(x = TotalRegionSize)) +
  geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
  xlab("Total Region Size") +
  ylab("Frequency") +
  ggtitle("Histogram of Total Region Sizes per Gene")

Pull information about the TSS for each gene

# Reuse gene_list: now the unique genes actually present in bed_data
gene_list <- unique(bed_data$GeneNames)

# Select hg19 from Ensembl 
# NOTE: GRCh37 archive mart; requires network access
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://grch37.ensembl.org")

# Get TSS positions along with strand information
tss_info <- getBM(attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
                  filters = 'hgnc_symbol',
                  values = gene_list,
                  mart = ensembl)

find TSS distance

# Make function to select the most upstream TSS based on strand
select_upstream_tss <- function(df) {
  if (all(df$strand == 1)) {
    return(data.frame(transcription_start_site = min(df$transcription_start_site)))  # Forward strand
  } else {
    return(data.frame(transcription_start_site = max(df$transcription_start_site)))  # Reverse strand
  }
}

# Apply the function to each group of genes
# (one most-upstream TSS per HGNC symbol)
tss_upstream <- tss_info %>%
  group_by(hgnc_symbol) %>%
  do(select_upstream_tss(.))


# Merge bed_data with tss_upstream on gene names
# (inner merge: regions for genes without TSS info are dropped)
merged_data <- merge(bed_data, tss_upstream, by.x = "GeneNames", by.y = "hgnc_symbol")

# Calculate the midpoint (peak center) of each interval
merged_data$Center <- (merged_data$Start + merged_data$End) / 2

# Calculate the distance
merged_data$DistanceToTSS <- abs(merged_data$Center - merged_data$transcription_start_site)

Distance to TSS

mean(merged_data$DistanceToTSS)
## [1] 264213.1
median(merged_data$DistanceToTSS)
## [1] 176294

Plot distance to TSS histogram

# Create a histogram of peak-center-to-TSS distances
median_distance <- median(merged_data$DistanceToTSS, na.rm = TRUE)

# With scale_x_log10, binwidth = 0.1 is in log10 units (each bin spans a
# ~1.26-fold range of distances).
# `size` for lines was deprecated in ggplot2 3.4.0; use `linewidth`.
dist_tss <- ggplot(merged_data, aes(x = DistanceToTSS)) +
  geom_histogram(binwidth = 0.1, fill = "#FFB3BA", color = "black") +
  geom_vline(aes(xintercept = median_distance), color = "black", linetype = "dashed", linewidth = 1) +
  xlab("Distance to TSS (bp)") +
  ylab("Frequency") +
  theme_cowplot(12) +
  ggtitle("Histogram of Distances to Transcription Start Sites (TSS)") +
  scale_x_log10(breaks = 10^(0:6) * 10000, labels = scales::comma)

dist_tss

Pull all TSSs

# Retrieve TSS information for all genes along with HGNC symbols
# (no filter: whole-genome pull from the GRCh37 mart)
all_tss_info <- getBM(
  attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
  mart = ensembl
)

# Filter out rows where hgnc_symbol is blank
filtered_all_tss_info <- all_tss_info %>%
  filter(hgnc_symbol != "")

See if the peak is closer to another TSS than the contacted one!

# Function to select the most upstream TSS along with chromosome information
# based on strand, just like before, but for all genes.
# NOTE: this redefines the earlier select_upstream_tss(). Returns a one-row
# data frame (NA row for empty input). isTRUE() guards against NA strand
# values, which previously made if() error; mixed-strand groups fall through
# to the reverse-strand branch, as before.
select_upstream_tss <- function(df) {
  if (nrow(df) == 0) {
    return(data.frame(chromosome_name = NA, transcription_start_site = NA)) # Return NA if there are no rows
  }
  if (isTRUE(all(df$strand == 1))) {
    # Forward strand
    tss <- min(df$transcription_start_site, na.rm = TRUE)
  } else {
    # Reverse strand
    tss <- max(df$transcription_start_site, na.rm = TRUE)
  }
  # Chromosome of the transcript whose TSS equals the selected coordinate
  chromosome <- df$chromosome_name[which.min(abs(df$transcription_start_site - tss))]
  return(data.frame(chromosome_name = chromosome, transcription_start_site = tss))
}

# Apply the function to each group of genes
all_tss_upstream <- filtered_all_tss_info %>%
  group_by(hgnc_symbol) %>%
  do(select_upstream_tss(.)) %>%
  ungroup() 

# Prepend 'chr' to chromosome names
# (UCSC-style names so they match the BED-derived seqnames used below)
all_tss_upstream$chromosome_name <- paste0("chr", all_tss_upstream$chromosome_name)

Find the closest transcription start site (TSS) to each peak center and calculate the distance.

# Create GRanges object for all_tss_upstream
# (width-1 ranges at each TSS coordinate)
tss_ranges <- GRanges(
  seqnames = all_tss_upstream$chromosome_name,
  ranges = IRanges(start = all_tss_upstream$transcription_start_site, end = all_tss_upstream$transcription_start_site)
)

# Create GRanges object for merged_data
# (width-1 ranges at each peak center)
merged_data_ranges <- GRanges(
  seqnames = merged_data$Chromosome,
  ranges = IRanges(start = merged_data$Center, end = merged_data$Center)
)

# Find the nearest TSS for each midpoint (peak center)
# nearest() returns an index into tss_ranges for each peak
nearest_hits <- nearest(merged_data_ranges, tss_ranges)

# Extract the closest TSS information
closest_tss_info <- all_tss_upstream[nearest_hits, ]

# Add closest TSS information to merged_data
merged_data$closest_gene <- closest_tss_info$hgnc_symbol
merged_data$closest_tss <- closest_tss_info$transcription_start_site
merged_data$closest_tss_chromosome <- closest_tss_info$chromosome_name

# Calculate distance to closest TSS
merged_data$distance_to_closest_tss <- abs(merged_data$Center - merged_data$closest_tss)

Check if the closest TSS gene matches the originally contacted gene

# Compare GeneNames with closest_gene and append
# TRUE when the contacted gene is also the nearest-TSS gene
merged_data$gene_match <- merged_data$GeneNames == merged_data$closest_gene

Compute and print the percentage of matches and mismatches between the contacted gene and the closest TSS gene.

# Fraction of peaks whose contacted gene is also the nearest-TSS gene
fraction_match <- sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data)

# Complement: peaks whose nearest TSS belongs to a different gene
fraction_mismatch <- 1 - fraction_match

# Report both as percentages
print(100 * fraction_match)
## [1] 13.26727
print(100 * fraction_mismatch)
## [1] 86.73273

Let's see which genes have regions mapped to them that are closer to other TSSs

# Aggregate data by GeneNames
# For each gene, record whether any/all of its regions match the nearest TSS
gene_summary <- merged_data %>%
  dplyr::group_by(GeneNames) %>%
  dplyr::summarize(
    any_true = any(gene_match, na.rm = TRUE),
    any_false = any(!gene_match, na.rm = TRUE),
    all_true = all(gene_match, na.rm = TRUE),
    all_false = all(!gene_match, na.rm = TRUE)
  ) %>%
  ungroup()

# Adjusted calculations: denominator includes the 16 panel genes with no
# ATAC peaks (see the cat() output below), giving the full panel of 363
adjusted_denominator <- nrow(gene_summary) + 16

percent_any_true <- sum(gene_summary$any_true) / adjusted_denominator * 100
number_any_true <- sum(gene_summary$any_true)
percent_any_false <- sum(gene_summary$any_false) / adjusted_denominator * 100
number_any_false <- sum(gene_summary$any_false) 
percent_all_true <- sum(gene_summary$all_true) / adjusted_denominator * 100
number_all_true <- sum(gene_summary$all_true) 
percent_all_false <- sum(gene_summary$all_false) / adjusted_denominator * 100
number_all_false <- sum(gene_summary$all_false) 
percent_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) / adjusted_denominator * 100
number_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) 

# Print the results
# NOTE(review): 363 is hard-coded; it should equal adjusted_denominator — verify
cat("Total genes in panel", 363 ,"\n")
## Total genes in panel 363
cat("Genes with no ATAC peaks:", 100*(16/363),"%, ",16,"\n")
## Genes with no ATAC peaks: 4.407713 %,  16
cat("Genes with at least one TRUE:", percent_any_true,"%, ", number_any_true,"\n")
## Genes with at least one TRUE: 42.69972 %,  155
cat("Genes with at least one FALSE:", percent_any_false,"%, ", number_any_false,"\n")
## Genes with at least one FALSE: 93.66391 %,  340
cat("Genes with only TRUE:", percent_all_true,"%, ", number_all_true,"\n")
## Genes with only TRUE: 1.928375 %,  7
cat("Genes with only FALSE:", percent_all_false,"%, ", number_all_false,"\n")
## Genes with only FALSE: 52.89256 %,  192
cat("Genes with both TRUE and FALSE:", percent_both_true_false,"%, ", number_both_true_false,"\n")
## Genes with both TRUE and FALSE: 40.77135 %,  148

input the count data

# Per-individual noncoding variant counts (tab-delimited, with header)
noncoding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/noncoding_frequency_by_individual_for_manuscript_fixed.txt", header = TRUE, sep = "\t")

Categorize the data

# Category 1 = Control; categories 2-6 = Case; anything else becomes NA
categorized_noncoding_stats <- noncoding_stats %>%
  mutate(Category_Group = case_when(
    Category == 1 ~ "Control",
    Category %in% 2:6 ~ "Case"
  ))

Aggregate and clean

#clean it up to remove inadvertent NAs or infs
# NOTE(review): filter_all()/all_vars() are superseded in dplyr;
# filter(if_all(everything(), ~ !is.infinite(.))) is the modern form
categorized_noncoding_stats <- categorized_noncoding_stats %>%
  dplyr::select(-ID, -Category) %>% 
  na.omit() %>% 
  dplyr::filter_all(all_vars(!is.infinite(.))) 

# Aggregate the Data to compute mean and standard deviation for each numeric column
collapsed_noncoding_stats <- categorized_noncoding_stats %>%
  group_by(Category_Group) %>%
  summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE), 
                                           std = ~sd(., na.rm = TRUE))))

# Remove rows with NA values
collapsed_noncoding_stats <- na.omit(collapsed_noncoding_stats)

Perform permutation test

# List of variables to analyze
noncoding_variables_to_analyze <- c("variant_count", "Epilepsy_gnomAD.0.001","Epilepsy_ultrarare","Cardiomyopathy_gnomAD.0.001","Cardiomyopathy_ultrarare")

# Two-sided permutation test on the difference in group means.
#
# data:             data frame holding the variable and the group labels
# var:              name of the numeric column to test
# group_var:        name of the column holding "Case"/"Control" labels
# num_permutations: number of label shuffles used to build the null
#
# Returns a list with the observed |mean(Case) - mean(Control)|, the vector
# of permuted statistics, and the permutation p-value.
permutation_test <- function(data, var, group_var, num_permutations = 10000) {
  values <- data[[var]]
  labels <- data[[group_var]]

  # Observed absolute difference in group means
  observed_stat <- abs(mean(values[labels == "Case"]) -
                       mean(values[labels == "Control"]))

  # Null distribution: recompute the statistic under shuffled labels
  perm_stats <- vapply(seq_len(num_permutations), function(i) {
    shuffled <- sample(labels)
    abs(mean(values[shuffled == "Case"]) -
        mean(values[shuffled == "Control"]))
  }, numeric(1))

  # p-value: proportion of permuted statistics at least as extreme
  p_value <- mean(perm_stats >= observed_stat)

  list(observed_stat = observed_stat, perm_stats = perm_stats, p_value = p_value)
}

# Initialize a dataframe to store p-values for each variable
p_value_results <- data.frame(Variable = character(), Observed_Stat = numeric(), p_value = numeric(), stringsAsFactors = FALSE)

# Loop through each variable and perform the permutation test
# (rbind in a loop is acceptable here: only a handful of variables)
for (var in noncoding_variables_to_analyze) {
  # Run the permutation test
  test_result <- permutation_test(categorized_noncoding_stats, var, "Category_Group")
  
  # Extract the permuted statistics, observed statistic, and p-value
  perm_stats <- test_result$perm_stats
  observed_stat <- test_result$observed_stat
  p_value <- test_result$p_value
  
  # Store the p-value and observed statistic in the dataframe
  p_value_results <- rbind(p_value_results, data.frame(Variable = var, 
                                                       Observed_Stat = observed_stat, 
                                                       p_value = p_value))
  
  # Create a data frame for plotting the permutation distribution
  perm_df <- data.frame(perm_stats = perm_stats)
  
  # Plot the permutation distribution with the observed statistic marked and p-value
  # (`size` for lines was deprecated in ggplot2 3.4.0; use `linewidth`)
  p <- ggplot(perm_df, aes(x = perm_stats)) +
    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
    geom_vline(aes(xintercept = observed_stat), color = "red", linetype = "dashed", linewidth = 1) +
    labs(title = paste("Permutation Test for", var),
         x = "Permuted Statistic",
         y = "Frequency",
         subtitle = paste("Observed Statistic =", round(observed_stat, 4), 
                          "| P-value =", format(p_value, digits = 4))) +  # Add p-value to subtitle
    theme_minimal()
  
  # Print the plot
  print(p)
}

# Print the table of p-values
print("P-value Results for All Variables:")
## [1] "P-value Results for All Variables:"
print(p_value_results)
##                      Variable Observed_Stat p_value
## 1               variant_count    133.718673   0.000
## 2       Epilepsy_gnomAD.0.001      1.972116   0.013
## 3          Epilepsy_ultrarare      9.608424   0.000
## 4 Cardiomyopathy_gnomAD.0.001      5.523130   0.000
## 5    Cardiomyopathy_ultrarare      7.756665   0.000

Define a function to compute the ratio of each column’s mean relative to the Control group and calculate confidence intervals

# Function to plot each column as a ratio to the means from Group 1.
# Summarises one column as the ratio of each group's mean to the Control
# mean, with SEM-based 95% confidence bounds on that ratio.
#
# data:     data frame containing a Category_Group column ("Control"/"Case")
# col_name: name of the numeric column to summarise
#
# Returns a list with a single element, summary_data, so existing call
# sites that use plot_data$summary_data keep working.
# (The unused group1_sd computation was removed.)
plot_column_ratio <- function(data, col_name) {
  # Mean of the Control (reference) group
  group1_mean <- mean(data[data$Category_Group == "Control", col_name], na.rm = TRUE)
  
  # Calculate the ratio for each group relative to Group 1
  data_summary <- data %>%
  group_by(Category_Group) %>%
  summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
            sd_value = sd(.data[[col_name]], na.rm = TRUE), 
            n = n()) %>%
  mutate(ratio = mean_value / group1_mean,
         sem_value = sd_value / sqrt(n), #  SEM calculation
         sem_ratio = sem_value / group1_mean, # SEM on the ratio scale
         ci_lower = ratio - (1.96 * sem_ratio), # Lower bound of the CI
         ci_upper = ratio + (1.96 * sem_ratio)) # Upper bound of the CI

  return(list(summary_data = data_summary))
}

Iterate over selected columns, compute summary statistics, and store results in a dataframe.

# List of all columns to plot
columns_to_plot <- setdiff(names(categorized_noncoding_stats), c("ID", "Category", "Category_Group"))

# Build the per-column summaries in one pass and bind them once.
# The previous loop seeded bind_rows() with an empty data frame whose
# columns (mean, std) did not match the summaries, which both grew the
# result row-by-row and leaked all-NA mean/std columns into the output.
combined_noncoding_stats_summary_df <- columns_to_plot %>%
  lapply(function(col) {
    mutate(plot_column_ratio(categorized_noncoding_stats, col)$summary_data,
           col_name = col)
  }) %>%
  bind_rows()

Filter and factorize selected noncoding variants, then generate a plot showing their mean ratios compared to the Control group

# Subset the data for the specified variables
# (ultrarare noncoding burdens, cases only)
selected_noncoding_data <- combined_noncoding_stats_summary_df %>%
  filter(col_name %in% c(
                        "Cardiomyopathy_ultrarare",
                       "Epilepsy_ultrarare"),
         Category_Group != "Control") 


# Define the specific order for noncoding variants
levels_order <- c(
                      "Epilepsy_ultrarare",
                      "Cardiomyopathy_ultrarare"
                       )

# Factorize the 'col_name' column with the specified order
selected_noncoding_data$col_name <- factor(selected_noncoding_data$col_name, levels = levels_order)

# Plot ratios to the Control mean with SEM error bars.
# The error bars now use the same dodge width as the points (1); the
# previous width of 0.5 would misalign bars and points whenever more than
# one Category_Group is drawn.
noncoding_stats_plot <- ggplot(selected_noncoding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
  geom_point(position = position_dodge(width = 1), size = 3) +
  geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 1), width = 0.5) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  scale_color_manual(values = "#FF6464") +
  labs(title = "Ratio Compared to Group 1 +/-SEM",
       y = "Variant",
       x = "Ratio to Group 1 Mean",
       color = "Category Group") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8), 
        axis.text.y = element_text(size = 8, hjust = 1),
        axis.title.x = element_text(size = 8), 
        axis.title.y = element_text(size = 8),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 8),
        plot.title = element_text(size = 8, hjust = 0.5),
        plot.subtitle = element_text(size = 8),
        plot.caption = element_text(size = 8),
        plot.margin = margin(15, 15, 15, 15)) +
  scale_x_continuous(limits = c(0.75, 2))


print(noncoding_stats_plot)

Input the data (change to your path)

# Pull in the gene data
# (per-individual, per-gene ultrarare noncoding variant counts)
gene_data_noncoding <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_noncoding_variants_by_gene_for_manuscript_fixed.csv")

# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript_fixed.txt", sep = "\t", header = TRUE)

# add the arrest category to each
# (lookup by CGM id; individuals missing from the cohort table get NA)
gene_data_noncoding$Category <- cohort$arrest_ontology[match(gene_data_noncoding$ID, cohort$CGM_id)]

#filter for discovery only
gene_data_noncoding_discovery <- gene_data_noncoding %>%
  filter(Set == "Discovery")

Summarize the data by GENE, counting the number of variants for each gene Filter for Category > 1 and then group and summarize

# Cases (categories 2-6): total ultrarare noncoding variants per gene
variants_per_gene_Cat_2_6 <- gene_data_noncoding_discovery %>%
  filter(Category > 1) %>%
  group_by(GENE) %>%
  summarise(
    noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Cat_2_6)
## # A tibble: 424 × 2
##    GENE       noncoding_ultrarare
##    <chr>                    <int>
##  1 A2ML1                       95
##  2 AAMP                        10
##  3 ABAT                        64
##  4 ABCC9                        0
##  5 AC011551.3                   0
##  6 ACADVL                      27
##  7 ACTC1                       31
##  8 ACTL6B                     172
##  9 ACTN2                       28
## 10 ADAM22                       0
## # ℹ 414 more rows

Filter for Category == 1 and then group and summarize

# Controls (category 1): total ultrarare noncoding variants per gene
variants_per_gene_Cat_1 <- gene_data_noncoding_discovery %>%
  filter(Category == 1) %>%
  group_by(GENE) %>%
  summarise(
    noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(variants_per_gene_Cat_1)
## # A tibble: 424 × 2
##    GENE       noncoding_ultrarare
##    <chr>                    <int>
##  1 A2ML1                       37
##  2 AAMP                         1
##  3 ABAT                        50
##  4 ABCC9                        0
##  5 AC011551.3                   0
##  6 ACADVL                       9
##  7 ACTC1                        7
##  8 ACTL6B                     130
##  9 ACTN2                       16
## 10 ADAM22                       0
## # ℹ 414 more rows

Append panel source to the gene_data

# Ensure that the 'GENE' column in 'gene_data_noncoding_discovery' and 'Gene' in 'all_genes' are character type
gene_data_noncoding_discovery$GENE <- as.character(gene_data_noncoding_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)

# Find the index of each gene in 'all_genes'
# Then, use this index to assign the corresponding 'Source' to a new column in 'gene_data'
# (genes absent from all_genes get NA)
gene_data_noncoding_discovery$Source <- all_genes$Source[match(gene_data_noncoding_discovery$GENE, all_genes$Gene)]

# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'

# View the first few rows to verify the new 'Source' has been added
head(gene_data_noncoding_discovery)
##           ID         GENE noncoding_ultrarare       Set  X X.1 Category
## 1 CGM0000969       RANGRF                   0 Discovery NA  NA        2
## 2 CGM0000969          AGL                   0 Discovery NA  NA        2
## 3 CGM0000969        RNF13                   0 Discovery NA  NA        2
## 4 CGM0000969       PRKAG2                   0 Discovery NA  NA        2
## 5 CGM0000969         JPH2                   1 Discovery NA  NA        2
## 6 CGM0000969 RP11-156P1.2                   0 Discovery NA  NA        2
##           Source
## 1 Cardiomyopathy
## 2 Cardiomyopathy
## 3       Epilepsy
## 4 Cardiomyopathy
## 5 Cardiomyopathy
## 6           <NA>

Add the source to the category 1 variants list

# Annotate each control-set gene with its panel source (Cardiomyopathy/Epilepsy)
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_1)
## # A tibble: 6 × 3
##   GENE       noncoding_ultrarare Source        
##   <chr>                    <int> <chr>         
## 1 A2ML1                       37 Cardiomyopathy
## 2 AAMP                         1 <NA>          
## 3 ABAT                        50 Epilepsy      
## 4 ABCC9                        0 Cardiomyopathy
## 5 AC011551.3                   0 <NA>          
## 6 ACADVL                       9 Cardiomyopathy

Add the source to the category 2-6 variants list

# Annotate each case-set gene with its panel source (Cardiomyopathy/Epilepsy)
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]

head(variants_per_gene_Cat_2_6)
## # A tibble: 6 × 3
##   GENE       noncoding_ultrarare Source        
##   <chr>                    <int> <chr>         
## 1 A2ML1                       95 Cardiomyopathy
## 2 AAMP                        10 <NA>          
## 3 ABAT                        64 Epilepsy      
## 4 ABCC9                        0 Cardiomyopathy
## 5 AC011551.3                   0 <NA>          
## 6 ACADVL                      27 Cardiomyopathy

combine the variants together now

# Tag each per-gene summary with its category label before stacking
variants_per_gene_Cat_1$Category <- "1"
variants_per_gene_Cat_2_6$Category <- "2-6"

# Stack the control and case summaries into one long table
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)

# Print the combined dataset
print(combined_variants)
## # A tibble: 848 × 4
##    GENE       noncoding_ultrarare Source         Category
##    <chr>                    <int> <chr>          <chr>   
##  1 A2ML1                       37 Cardiomyopathy 1       
##  2 AAMP                         1 <NA>           1       
##  3 ABAT                        50 Epilepsy       1       
##  4 ABCC9                        0 Cardiomyopathy 1       
##  5 AC011551.3                   0 <NA>           1       
##  6 ACADVL                       9 Cardiomyopathy 1       
##  7 ACTC1                        7 Cardiomyopathy 1       
##  8 ACTL6B                     130 Epilepsy       1       
##  9 ACTN2                       16 Cardiomyopathy 1       
## 10 ADAM22                       0 Epilepsy       1       
## # ℹ 838 more rows

Plot the number of noncoding variant types for the genes

# Filter dataset where noncoding_ultrarare > 0
# (keep only genes with at least one variant and a known panel source)
noncoding_ultrarare_data <- combined_variants %>%
  filter(noncoding_ultrarare > 0) %>%
  filter(!is.na(Source))


# Label function for noncoding_ultrarare plot: maps Category values to
# display labels ("1" -> "Control", anything else -> "Case"); values for
# other variables pass through unchanged.
custom_labeller_noncoding <- function(variable, value) {
  if (variable != "Category") {
    return(value)
  }
  ifelse(value == "1", "Control", "Case")
}

# Create the noncoding_ultrarare plot
# Heatmap: genes on x, case/control on y, tile fill = ultrarare noncoding
# variant count; faceted by source panel
noncoding_ultrarare_plot <- ggplot(noncoding_ultrarare_data, aes(x = GENE, y = Category, fill = noncoding_ultrarare)) +
  geom_tile() + 
  scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"), 
                       values = scales::rescale(c(0, 0.5, 1))) + 
  facet_wrap(~ Source, scales = "free_x", ncol = 1) + 
  labs(title = "Noncoding Ultrarare Heatmap",
       x = "Gene",
       y = "Category",
       fill = "Count") +
  theme_cowplot(12) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), 
        strip.background = element_blank(), 
        strip.placement = "outside") +
  scale_y_discrete(labels = function(x) custom_labeller_noncoding("Category", x))

# Print the plot
print(noncoding_ultrarare_plot)

Extract pLI constraint scores from gnomAD, compute enrichment stats for noncoding ultrarare variants, and plot their significance relative to pLI

# gnomAD_data pLI constraint data
gnomad_data <- fread("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/gnomad.v2.1.1.lof_metrics.by_gene.txt")

# Function to fetch pLI
# Looks up the pLI constraint score for one gene symbol in the gnomAD
# metrics table (data.table syntax). Errors when the gene is absent;
# callers wrap this in try() and skip missing genes.
get_pLI <- function(gene_symbol, gnomad_data) {
  result <- gnomad_data[gene == gene_symbol, .(gene, pLI)]
  if (nrow(result) == 0) {
    stop("Gene not found in gnomAD data.")
  }
  return(result)
}

# Initialize an empty data frame to store the results
final_merged_data <- data.frame(GENE = character(), p_value = numeric(), pLI = numeric(), Source = character(), ratio = numeric(), stringsAsFactors = FALSE)

# Process each panel "Source" group separately
for (source in unique(noncoding_ultrarare_data$Source)) {
  
  # Filter data by panel "Source"
  source_data <- noncoding_ultrarare_data %>% filter(Source == source)
  
  # Extract unique genes for this Source
  unique_genes <- unique(source_data$GENE)
  
  # Create a data frame to store pLI values
  pli_values <- data.frame(GENE = character(), pLI = numeric(), stringsAsFactors = FALSE)
  
  # Fetch pLI values for each gene
  # (try() skips genes absent from the gnomAD table, where get_pLI errors)
  for (gene in unique_genes) {
    try({
      pli_value <- get_pLI(gene, gnomad_data)
      pli_values <- rbind(pli_values, data.frame(GENE = gene, pLI = pli_value$pLI, stringsAsFactors = FALSE))
    }, silent = TRUE)
  }
  
  # Compute p-values and ratios for each gene 
  # Chi-square goodness-of-fit of the control/case split against the
  # expected split from the cohort sizes (537 controls, 506 cases;
  # hard-coded — TODO confirm against the cohort table)
  p_values <- source_data %>%
    group_by(GENE) %>%
    summarise(
      p_value = {
        control_count <- sum(noncoding_ultrarare[Category == 1], na.rm = TRUE)
        case_count <- sum(noncoding_ultrarare[Category == "2-6"], na.rm = TRUE)
        total_count <- sum(noncoding_ultrarare, na.rm = TRUE)
        expected_control <- total_count * (537 / (537 + 506))
        expected_case <- total_count * (506 / (537 + 506))
        chisq_stat <- ((control_count - expected_control)^2 / expected_control) + 
                      ((case_count - expected_case)^2 / expected_case)
        pchisq(chisq_stat, df = 1, lower.tail = FALSE)
      },
      # Per-individual rate ratio (cases vs controls); NA when undefined
      ratio = {
        control_count <- sum(noncoding_ultrarare[Category == 1], na.rm = TRUE)
        case_count <- sum(noncoding_ultrarare[Category == "2-6"], na.rm = TRUE)
        if (is.na(control_count) | is.na(case_count) | control_count == 0) {
          NA
        } else {
          (case_count/506) / (control_count/537)
        }
      }
    )
  
  # Merge with pLI values
  # (inner merge: genes without a pLI score are dropped)
  merged_data <- merge(p_values, pli_values, by = "GENE")
  merged_data$Source <- source
  
  # Append to the final data frame
  final_merged_data <- rbind(final_merged_data, merged_data)
}

# Calculate the Bonferroni correction cutoff
num_tests <- nrow(final_merged_data)
bonferroni_cutoff <- 0.05 / num_tests

# Filter for Bonferroni corrected p-values
bonferroni_filtered_data <- final_merged_data %>%
  filter(p_value < bonferroni_cutoff)

# Filter for the top 12 lowest p-values within each source group
# NOTE(review): variable is named top10_labels but keeps 12 per group
top10_labels <- bonferroni_filtered_data %>%
  group_by(Source) %>%
  slice_min(order_by = p_value, n = 12) %>%
  ungroup()

#Filter for only Cardio
cardio_data <- bonferroni_filtered_data %>%
  filter(Source == "Cardiomyopathy")

# Plot: -log10(p) vs pLI for cardiomyopathy-panel genes, colored by the
# case/control burden ratio.
# Fix: gene labels are restricted to the Cardiomyopathy source. Previously
# geom_text() used the full top10_labels table, so top hits from every other
# source were also drawn onto this cardiomyopathy-only scatter.
reg_variant_plot <- ggplot(cardio_data, aes(y = -log10(p_value), x = pLI, color = ratio)) +
  geom_point() +
  geom_text(data = filter(top10_labels, Source == "Cardiomyopathy"),
            aes(label = GENE), hjust = 0.5, vjust = -0.5) +
  scale_color_gradient(low = "#3C8C78", high = "#dcb43c", na.value = "grey") +
  labs(title = paste("P-Values of Noncoding Ultrarare Variants vs. pLI by Source (Bonferroni, p <", format(bonferroni_cutoff, digits = 2), ")"),
       x = "pLI",
       y = "-log10(p-value)",
       color = "Ratio\n(Categories 2-6 / Category 1)") +
  theme_cowplot(12)

# Print the plot
print(reg_variant_plot)

The transcription-factor motif enrichment results below were entered manually from HOMER output.

# HOMER motif enrichment results (transcribed by hand from HOMER output):
# one enrichment p-value per transcription factor motif.
motif_data <- data.frame(
  TranscriptionFactor = c(
    "IRF4", "NFY", "NKX2.5", "PRDM1",
    "SMAD2", "ESRRA", "NFKB", "ASCL1"
  ),
  PValue = c(
    1e-23, 1e-19, 1e-18, 1e-17,
    1e-17, 1e-17, 1e-15, 1e-13
  )
)

# Horizontal bar chart of HOMER motif enrichment: one bar per transcription
# factor, length = -log10(enrichment p-value). geom_col() is the idiomatic
# equivalent of geom_bar(stat = "identity").
motif_plot <- ggplot(motif_data, aes(y = TranscriptionFactor, x = -log10(PValue))) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Transcription Factor P-values",
    y = "Transcription Factor",
    x = "-log10(P-value)"
  ) +
  theme_cowplot(12)

motif_plot

Combined******************************************************************************************************************

Read the cohort data

# Read the cohort data
# Combined discovery + replication table: one row per subject, tab-delimited,
# with a "Set" column distinguishing the two cohorts.
total_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/combined_data_for_manuscript_fixed_NEW.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)


# Discovery subset: used to fit every model below.
combined_data <- total_data %>% filter(Set =="Discovery") 

# Replication subset: held out; scored later with the discovery-fit models.
replication_data <- total_data %>%  filter(Set =="Replication") 

Categorize the Data based on original categories and arrest status

# Assign case/control labels from the arrest grouping (discovery cohort).
# Controls are arrest_group 1; cases are groups 2-6. The "z" prefix on
# "zCase" keeps "Control" first alphabetically, so glm() later uses Control
# as the reference level of the factor.
categorized_combined_data <- combined_data %>%
  mutate(
    Category = case_when(
      arrest_group == 1 ~ "Control",
      arrest_group %in% 2:6 ~ "zCase"
    )
  ) %>%
  filter(!is.na(Category)) %>%            # drop subjects outside groups 1-6
  mutate(Category = as.factor(Category))  # factor for glm()/summaries

############################ ############## ############## replication
# Same case/control labeling for the replication cohort. As above, the "z"
# prefix on "zCase" forces alphabetical ordering (Control < zCase) so that
# Control is the reference factor level.
categorized_replication_data <- replication_data %>%
  mutate(
    Category = case_when(
      arrest_group == 1 ~ "Control",
      arrest_group %in% 2:6 ~ "zCase"
    )
  ) %>%
  filter(!is.na(Category)) %>%            # drop subjects outside groups 1-6
  mutate(Category = as.factor(Category))  # factor for predict()/summaries

Collapse the rare vars

# Collapse per-frequency variant counts into summary burden columns
# (discovery cohort). A single chained mutate replaces the original six;
# column creation order is preserved.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    # Missense burden, split by allele-frequency tier.
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    # Putative null (protein-truncating) variants.
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    # Rare noncoding burden: gnomAD-rare + cohort-rare + cohort singletons.
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )


############################ ############## ############## replication
# Same burden-collapsing for the replication cohort, as one chained mutate;
# column creation order matches the discovery cohort.
categorized_replication_data <- categorized_replication_data %>%
  mutate(
    # Missense burden, split by allele-frequency tier.
    Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
    Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
    # Putative null (protein-truncating) variants.
    Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
    Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
    Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
    Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
    # Rare noncoding burden: gnomAD-rare + cohort-rare + cohort singletons.
    Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
    Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
  )

Perform multivariate Logistic Regression on everything

# Full ("combined") logistic regression: case/control status modeled jointly
# on the normalized PRS columns, rare/ultrarare coding burdens, P/LP carrier
# counts, rare noncoding burdens, and null-variant burdens.
model <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare  + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_combined_data, family = binomial())
# The warning below indicates (quasi-)complete separation for some subjects:
# fitted probabilities of exactly 0/1. The linear-predictor scores are still
# usable, but coefficient SEs for affected terms should be read with caution.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model
## 
## Call:  glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval + 
##     Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
##     Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + 
##     Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + 
##     Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + 
##     Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
##     Cardiomyopathy_null + Epilepsy_null, family = binomial(), 
##     data = categorized_combined_data)
## 
## Coefficients:
##                   (Intercept)          Normalized_Heart_rate  
##                      -1.13276                       -0.06633  
##        Normalized_PR_interval                 Normalized_QTc  
##                       0.10545                       -0.06282  
##                 Normalized_BP                 Normalized_QRS  
##                       0.29888                        0.41302  
##               Normalized_LVEF               Normalized_LVESV  
##                      -0.17686                        0.16828  
##                Normalized_SVI                Normalized_Afib  
##                       0.05697                       -0.07965  
##             Normalized_LVEDVI                  Normalized_SV  
##                      -0.09490                       -0.09174  
##                 Epilepsy_rare             Epilepsy_ultrarare  
##                      -0.06083                       -0.01102  
##           Cardiomyopathy_rare       Cardiomyopathy_ultrarare  
##                      -0.03503                       -0.01155  
##                  PLP_Epilepsy             PLP_Cardiomyopathy  
##                      -0.19700                        1.18960  
## Cardiomyopathy_noncoding_rare        Epilepsy_noncoding_rare  
##                      -0.01056                        0.03214  
##           Cardiomyopathy_null                  Epilepsy_null  
##                       0.96647                        0.44278  
## 
## Degrees of Freedom: 992 Total (i.e. Null);  971 Residual
## Null Deviance:       1370 
## Residual Deviance: 1132  AIC: 1176

Compute the scores

# tell it how to make the scores
# type = "link" returns the linear predictor (log-odds scale), not a
# probability; probabilities are derived later via plogis().
scores <- predict(model, type = "link")

# Out-of-sample scores: the discovery-fit model applied to the replication set.
scores_replication <- predict(model, newdata = categorized_replication_data,  type = "link")

# Add scores to the dataframes
categorized_combined_data$scores <- scores
categorized_replication_data$scores_replication <- scores_replication

Perform multivariate logistic regressions based only on subsets of the input parameters

# Sub-models: each uses one slice of the predictors, so their quintile odds
# ratios can be compared against the full model later.

# Rare coding burden, cardiomyopathy gene panel only.
model_cardiomyopathy_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, data = categorized_combined_data, family = binomial())

# Rare coding burden, epilepsy gene panel only.
model_epilepsy_rare <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, data = categorized_combined_data, family = binomial())

# Rare noncoding burden, both panels.
model_noncoding_rare <- glm(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, data = categorized_combined_data, family = binomial())

# Common-variant (normalized PRS) predictors only.
model_common <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())

# Rare coding burden, both panels combined.
model_coding_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, data = categorized_combined_data, family = binomial())

# Epilepsy rare coding burden + common-variant predictors.
model_epilepsy_and_common <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())

# Cardiomyopathy rare coding burden + common-variant predictors.
model_cardiac_and_common <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, data = categorized_combined_data, family = binomial())

# Common + all rare coding predictors (full model minus the noncoding terms).
model_common_and_coding <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_combined_data, family = binomial())

Compute the rest of the scores and determine rank-based quintiles for all scores

# In-sample (discovery) linear-predictor scores from each sub-model.
# Data-driven: one (new column -> fitted model) pair per score.
score_models <- list(
  cardiac_variant_score     = model_cardiomyopathy_rare,
  epilepsy_variant_score    = model_epilepsy_rare,
  noncoding_variant_score   = model_noncoding_rare,
  common_variant_score      = model_common,
  common_and_coding_score   = model_common_and_coding,
  coding_rare_score         = model_coding_rare,
  epilepsy_and_common_score = model_epilepsy_and_common,
  cardiac_and_common_score  = model_cardiac_and_common
)
for (score_col in names(score_models)) {
  categorized_combined_data[[score_col]] <-
    predict(score_models[[score_col]], type = "link")
}


# Rank each score and assign rank-based quintiles (1 = lowest fifth,
# 5 = highest). ties.method = "first" makes ranks unique so every quintile
# holds the same number of subjects (up to rounding). Each spec is
# c(score column, rank column, quintile column); the rank column is kept
# because downstream code may reference it.
quintile_spec <- list(
  c("scores",                    "rank",                     "score_quintiles"),
  c("cardiac_variant_score",     "rank_cardiac",             "cardiac_score_quintiles"),
  c("epilepsy_variant_score",    "rank_epilepsy",            "epilepsy_score_quintiles"),
  c("noncoding_variant_score",   "rank_noncoding",           "noncoding_score_quintiles"),
  c("common_variant_score",      "rank_common",              "common_score_quintiles"),
  c("common_and_coding_score",   "rank_common_and_coding",   "common_and_coding_score_quintiles"),
  c("coding_rare_score",         "rank_model_coding_rare",   "coding_rare_score_quintiles"),
  c("epilepsy_and_common_score", "rank_epilepsy_and_common", "epilepsy_and_common_score_quintiles"),
  c("cardiac_and_common_score",  "rank_cardiac_and_common",  "cardiac_and_common_score_quintiles")
)
n_obs <- nrow(categorized_combined_data)
for (spec in quintile_spec) {
  score_rank <- rank(categorized_combined_data[[spec[1]]], ties.method = "first")
  categorized_combined_data[[spec[2]]] <- score_rank
  categorized_combined_data[[spec[3]]] <- ceiling(score_rank / (n_obs / 5))
}

# Convert each linear-predictor (log-odds) score to a predicted probability
# via the inverse logit. Names map new probability column -> score column.
prob_spec <- c(
  probability                     = "scores",
  cardiac_probability             = "cardiac_variant_score",
  epilepsy_probability            = "epilepsy_variant_score",
  noncoding_probability           = "noncoding_variant_score",
  common_probability              = "common_variant_score",
  common_and_coding_probability   = "common_and_coding_score",
  coding_rare_probability         = "coding_rare_score",
  epilepsy_and_common_probability = "epilepsy_and_common_score",
  cardiac_and_common_probability  = "cardiac_and_common_score"
)
for (prob_col in names(prob_spec)) {
  categorized_combined_data[[prob_col]] <-
    plogis(categorized_combined_data[[prob_spec[[prob_col]]]])
}


# Per-quintile case/control odds ratios relative to the lowest quintile.
#
# Args:
#   data:            data frame holding the category and quintile columns.
#   category_column: unquoted column with levels "Control" and "zCase".
#   quintile_column: unquoted column of quintile assignments (1-5).
#
# Returns a tibble with per-quintile counts, odds (cases/controls), the odds
# ratio relative to the first row (assumed to be quintile 1 after grouping),
# and a 95% Wald CI on the log odds-ratio scale.
#
# NOTE(review): se_log_odds_ratio uses only each quintile's own counts
# (sqrt(1/controls + 1/cases)); a full SE for an OR against the reference
# quintile would also include the reference quintile's counts — confirm
# this simplification is intended before reuse.
calculate_odds_ratios <- function(data, category_column, quintile_column) {
  # Count controls and cases within each quintile, then form raw odds.
  quintile_tab <- data %>%
    dplyr::group_by({{ quintile_column }}) %>%
    dplyr::summarise(
      count_category_1 = sum({{ category_column }} == "Control", na.rm = TRUE),
      count_category_2_6 = sum({{ category_column }} == "zCase", na.rm = TRUE),
      .groups = 'drop'
    ) %>%
    dplyr::mutate(odds = count_category_2_6 / count_category_1)

  # First row (lowest quintile) serves as the reference odds.
  reference_odds <- quintile_tab$odds[1]

  # Odds ratio vs. reference plus a 95% Wald interval.
  quintile_tab %>%
    dplyr::mutate(
      odds_ratio = odds / reference_odds,
      se_log_odds_ratio = sqrt(1 / count_category_1 + 1 / count_category_2_6),
      lower_ci = exp(log(odds_ratio) - 1.96 * se_log_odds_ratio),
      upper_ci = exp(log(odds_ratio) + 1.96 * se_log_odds_ratio)
    )
}

# Quintile odds-ratio tables for the discovery cohort, one per score model.
combined_odds_summary        <- calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
cardiac_odds_summary         <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_score_quintiles)
epilepsy_summary             <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_score_quintiles)
common_summary               <- calculate_odds_ratios(categorized_combined_data, Category, common_score_quintiles)
noncoding_summary            <- calculate_odds_ratios(categorized_combined_data, Category, noncoding_score_quintiles)
common_and_coding_summary    <- calculate_odds_ratios(categorized_combined_data, Category, common_and_coding_score_quintiles)
coding_rare_summary          <- calculate_odds_ratios(categorized_combined_data, Category, coding_rare_score_quintiles)
# NOTE(review): "commmon" (three m's) is the established object name; kept
# as-is for compatibility with any downstream references.
commmon_and_epilepsy_summary <- calculate_odds_ratios(categorized_combined_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary   <- calculate_odds_ratios(categorized_combined_data, Category, cardiac_and_common_score_quintiles)


################################################################################################################ replication
# Out-of-sample linear-predictor scores for the replication cohort from each
# discovery-fit sub-model (newdata = replication set).
score_models_replication <- list(
  cardiac_variant_score_replication     = model_cardiomyopathy_rare,
  epilepsy_variant_score_replication    = model_epilepsy_rare,
  noncoding_variant_score_replication   = model_noncoding_rare,
  common_variant_score_replication      = model_common,
  common_and_coding_score_replication   = model_common_and_coding,
  coding_rare_score_replication         = model_coding_rare,
  epilepsy_and_common_score_replication = model_epilepsy_and_common,
  cardiac_and_common_score_replication  = model_cardiac_and_common
)
for (score_col in names(score_models_replication)) {
  categorized_replication_data[[score_col]] <-
    predict(score_models_replication[[score_col]],
            newdata = categorized_replication_data, type = "link")
}


# Replication cohort: rank-based quintiles per score (same column naming
# scheme as the discovery cohort; quintiles are computed within the
# replication set itself). Each spec is
# c(score column, rank column, quintile column).
quintile_spec_rep <- list(
  c("scores_replication",                    "rank",                     "score_quintiles"),
  c("cardiac_variant_score_replication",     "rank_cardiac",             "cardiac_score_quintiles"),
  c("epilepsy_variant_score_replication",    "rank_epilepsy",            "epilepsy_score_quintiles"),
  c("noncoding_variant_score_replication",   "rank_noncoding",           "noncoding_score_quintiles"),
  c("common_variant_score_replication",      "rank_common",              "common_score_quintiles"),
  c("common_and_coding_score_replication",   "rank_common_and_coding",   "common_and_coding_score_quintiles"),
  c("coding_rare_score_replication",         "rank_model_coding_rare",   "coding_rare_score_quintiles"),
  c("epilepsy_and_common_score_replication", "rank_epilepsy_and_common", "epilepsy_and_common_score_quintiles"),
  c("cardiac_and_common_score_replication",  "rank_cardiac_and_common",  "cardiac_and_common_score_quintiles")
)
n_obs_rep <- nrow(categorized_replication_data)
for (spec in quintile_spec_rep) {
  score_rank <- rank(categorized_replication_data[[spec[1]]], ties.method = "first")
  categorized_replication_data[[spec[2]]] <- score_rank
  categorized_replication_data[[spec[3]]] <- ceiling(score_rank / (n_obs_rep / 5))
}

# Inverse-logit each replication score into a predicted probability.
prob_spec_rep <- c(
  probability                     = "scores_replication",
  cardiac_probability             = "cardiac_variant_score_replication",
  epilepsy_probability            = "epilepsy_variant_score_replication",
  noncoding_probability           = "noncoding_variant_score_replication",
  common_probability              = "common_variant_score_replication",
  common_and_coding_probability   = "common_and_coding_score_replication",
  coding_rare_probability         = "coding_rare_score_replication",
  epilepsy_and_common_probability = "epilepsy_and_common_score_replication",
  cardiac_and_common_probability  = "cardiac_and_common_score_replication"
)
for (prob_col in names(prob_spec_rep)) {
  categorized_replication_data[[prob_col]] <-
    plogis(categorized_replication_data[[prob_spec_rep[[prob_col]]]])
}


# Quintile odds-ratio tables for the replication cohort, one per score model.
combined_odds_summary_replication        <- calculate_odds_ratios(categorized_replication_data, Category, score_quintiles)
cardiac_odds_summary_replication         <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_score_quintiles)
epilepsy_summary_replication             <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_score_quintiles)
common_summary_replication               <- calculate_odds_ratios(categorized_replication_data, Category, common_score_quintiles)
noncoding_summary_replication            <- calculate_odds_ratios(categorized_replication_data, Category, noncoding_score_quintiles)
common_and_coding_summary_replication    <- calculate_odds_ratios(categorized_replication_data, Category, common_and_coding_score_quintiles)
coding_rare_summary_replication          <- calculate_odds_ratios(categorized_replication_data, Category, coding_rare_score_quintiles)
# NOTE(review): "commmon" typo kept to match the discovery-cohort naming.
commmon_and_epilepsy_summary_replication <- calculate_odds_ratios(categorized_replication_data, Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary_replication   <- calculate_odds_ratios(categorized_replication_data, Category, cardiac_and_common_score_quintiles)

Plot the discovery-cohort log2 odds ratios across score quintiles

# ---- Discovery cohort: log2(OR) by score quintile, with 95% CIs ----

# Helper: overlay one summary's series (line, optional points, CI whiskers)
# in a single color. draw_points = FALSE for the series whose points were
# already drawn by the initial plot() call.
add_or_series <- function(s, col, draw_points = TRUE) {
  idx <- seq_along(s$odds_ratio)
  lines(idx, log2(s$odds_ratio), col = col)
  if (draw_points) {
    points(log2(s$odds_ratio), pch = 19, col = col)
  }
  arrows(x0 = idx,
         y0 = log2(s$lower_ci),
         y1 = log2(s$upper_ci),
         code = 3, angle = 90, length = 0.05, col = col)
}

# Shared y-range: span every CI that will be drawn on this panel.
all_lower <- c(combined_odds_summary$lower_ci, cardiac_odds_summary$lower_ci,
               epilepsy_summary$lower_ci, common_summary$lower_ci,
               noncoding_summary$lower_ci, common_and_coding_summary$lower_ci,
               coding_rare_summary$lower_ci)
all_upper <- c(combined_odds_summary$upper_ci, cardiac_odds_summary$upper_ci,
               epilepsy_summary$upper_ci, common_summary$upper_ci,
               noncoding_summary$upper_ci, common_and_coding_summary$upper_ci,
               coding_rare_summary$upper_ci)

# Base panel: the combined-model points establish the axes.
plot(log2(combined_odds_summary$odds_ratio),
     ylim = c(log2(min(all_lower)), log2(max(all_upper))),
     pch = 19, xaxt = "n", col = "#3C8C78",
     xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI")

# Overlay each model's series (same draw order/colors as before).
add_or_series(combined_odds_summary, "#3C8C78", draw_points = FALSE)
add_or_series(cardiac_odds_summary, "#2E86C1")
add_or_series(epilepsy_summary, "#A93226")
add_or_series(common_summary, "#229954")
add_or_series(noncoding_summary, "#D68910")
add_or_series(common_and_coding_summary, "Black")
add_or_series(coding_rare_summary, "Pink")

# Quintile tick labels and legend.
axis(1, at = seq_along(combined_odds_summary$odds_ratio), labels = TRUE)
legend("topleft",
       legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding",
                  "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910",
               "Black", "Pink"),
       pch = 19, cex = 0.8)

Plot the replication-cohort log2 odds ratios across score quintiles

# ---- Replication cohort: log2(OR) by score quintile, with 95% CIs ----

# Helper (redefined here so this chunk is self-contained): overlay one
# summary's series (line, optional points, CI whiskers) in a single color.
add_or_series <- function(s, col, draw_points = TRUE) {
  idx <- seq_along(s$odds_ratio)
  lines(idx, log2(s$odds_ratio), col = col)
  if (draw_points) {
    points(log2(s$odds_ratio), pch = 19, col = col)
  }
  arrows(x0 = idx,
         y0 = log2(s$lower_ci),
         y1 = log2(s$upper_ci),
         code = 3, angle = 90, length = 0.05, col = col)
}

# Base panel; fixed y-range for the replication cohort.
plot(log2(combined_odds_summary_replication$odds_ratio),
     ylim = c(-2, 10),
     pch = 19, xaxt = "n", col = "#3C8C78",
     xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI")

# Overlay each model's series (same draw order/colors as before).
add_or_series(combined_odds_summary_replication, "#3C8C78", draw_points = FALSE)
add_or_series(cardiac_odds_summary_replication, "#2E86C1")
add_or_series(epilepsy_summary_replication, "#A93226")
add_or_series(common_summary_replication, "#229954")
add_or_series(noncoding_summary_replication, "#D68910")
add_or_series(common_and_coding_summary_replication, "Black")
add_or_series(coding_rare_summary_replication, "Pink")

# Quintile tick labels and legend.
axis(1, at = seq_along(combined_odds_summary_replication$odds_ratio), labels = TRUE)
legend("topleft",
       legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding",
                  "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910",
               "Black", "Pink"),
       pch = 19, cex = 0.8)

Z test for odds significance

# Compare two quintile odds-ratio tables with a two-sided Z test.
#
# For each quintile, tests H0: the two (reference-relative) odds ratios are
# equal, using the difference of log odds ratios and the pooled standard
# error sqrt(se1^2 + se2^2) (assumes the two estimates are independent).
#
# Args:
#   odds1, odds2: data frames with columns `odds_ratio` and
#     `se_log_odds_ratio`, aligned row-by-row (same quintile order), as
#     produced by calculate_odds_ratios().
#
# Returns: data frame with `quintile` (row index) and `p_values`.
compare_odds <- function(odds1, odds2) {
  # The test is pairwise by row, so the tables must align.
  stopifnot(nrow(odds1) == nrow(odds2))

  # Difference in log odds ratios per quintile.
  delta_log_odds <- log(odds1$odds_ratio) - log(odds2$odds_ratio)

  # Pooled standard error of the difference.
  se_delta_log_odds <- sqrt(odds1$se_log_odds_ratio^2 + odds2$se_log_odds_ratio^2)

  # Two-sided Z test for each quintile.
  z_values <- delta_log_odds / se_delta_log_odds
  p_values <- 2 * pnorm(abs(z_values), lower.tail = FALSE)

  # seq_along() (not 1:length()) so a zero-row input yields a zero-row result.
  data.frame(quintile = seq_along(p_values), p_values = p_values)
}

# Pairwise quintile comparisons among the three headline score models.
# transform() appends the comparison label at creation time; the three
# individual result objects are kept for downstream use.
p_values_coding_vs_common_coding <- transform(
  compare_odds(coding_rare_summary, common_and_coding_summary),
  comparison = "Coding vs Common and Coding"
)
p_values_coding_vs_model <- transform(
  compare_odds(coding_rare_summary, combined_odds_summary),
  comparison = "Coding vs Model"
)
p_values_common_coding_vs_model <- transform(
  compare_odds(common_and_coding_summary, combined_odds_summary),
  comparison = "Common and Coding vs Model"
)

# Stack the three comparisons; quintile becomes a factor for plotting/tables.
combined_p_values <- bind_rows(p_values_coding_vs_common_coding,
                               p_values_coding_vs_model,
                               p_values_common_coding_vs_model)
combined_p_values$quintile <- factor(combined_p_values$quintile, levels = 1:5)

combined_p_values
##    quintile     p_values                  comparison
## 1         1 1.0000000000 Coding vs Common and Coding
## 2         2 0.0302326527 Coding vs Common and Coding
## 3         3 0.1655321102 Coding vs Common and Coding
## 4         4 0.0016031606 Coding vs Common and Coding
## 5         5 0.0376069426 Coding vs Common and Coding
## 6         1 1.0000000000             Coding vs Model
## 7         2 0.0459511700             Coding vs Model
## 8         3 0.0041855535             Coding vs Model
## 9         4 0.0006605028             Coding vs Model
## 10        5 0.0006065780             Coding vs Model
## 11        1 1.0000000000  Common and Coding vs Model
## 12        2 0.8875234087  Common and Coding vs Model
## 13        3 0.1454995597  Common and Coding vs Model
## 14        4 0.8097807932  Common and Coding vs Model
## 15        5 0.1582202334  Common and Coding vs Model

Calculate the scores per category and plot them

# Boxplots of each variant score by Category (Control vs case).
# FIX: the original used ylim(), which DROPS observations outside the limits
# before geom_boxplot computes its statistics, silently biasing the
# quartiles/notches (the suppressWarnings() around the print calls was hiding
# the "rows removed" warnings). coord_cartesian() zooms the view instead,
# keeping every observation in the boxplot statistics.
common_variant_score_plot <- ggplot(categorized_combined_data,
                                    aes(x = Category, y = common_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-2, 2)) +
  theme_cowplot(12)

cardiac_variant_score_plot <- ggplot(categorized_combined_data,
                                     aes(x = Category, y = cardiac_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-1, 1)) +
  theme_cowplot(12)

epilepsy_variant_score_plot <- ggplot(categorized_combined_data,
                                      aes(x = Category, y = epilepsy_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-1, 1)) +
  theme_cowplot(12)

noncoding_variant_score_plot <- ggplot(categorized_combined_data,
                                       aes(x = Category, y = noncoding_variant_score, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-1, 0.75)) +
  theme_cowplot(12)

scores_plot <- ggplot(categorized_combined_data,
                      aes(x = Category, y = scores, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-3, 3)) +
  theme_cowplot(12)

# Print each score boxplot. suppressWarnings() silences the "rows removed"
# warnings emitted when observations fall outside the fixed y-limits.
suppressWarnings(print(common_variant_score_plot))

suppressWarnings(print(cardiac_variant_score_plot))

suppressWarnings(print(epilepsy_variant_score_plot))

suppressWarnings(print(noncoding_variant_score_plot))

suppressWarnings(print(scores_plot))

Replication cohort

Now plot the Z normalized replication data

# Z-normalize each replication score using the replication cohort's own mean
# and SD (the means/SDs are kept as named objects for reuse later).
mean_common_replication <- mean(categorized_replication_data$common_variant_score_replication)
sd_common_replication <- sd(categorized_replication_data$common_variant_score_replication)
mean_cardiac_replication <- mean(categorized_replication_data$cardiac_variant_score_replication)
sd_cardiac_replication <- sd(categorized_replication_data$cardiac_variant_score_replication)
mean_epilepsy_replication <- mean(categorized_replication_data$epilepsy_variant_score_replication)
sd_epilepsy_replication <- sd(categorized_replication_data$epilepsy_variant_score_replication)
mean_noncoding_replication <- mean(categorized_replication_data$noncoding_variant_score_replication)
sd_noncoding_replication <- sd(categorized_replication_data$noncoding_variant_score_replication)
mean_scores_replication <- mean(categorized_replication_data$scores_replication)
sd_scores_replication <- sd(categorized_replication_data$scores_replication)


# Boxplots of the z-normalized replication scores by Category.
# FIX: ylim() was replaced with coord_cartesian(): ylim() deletes observations
# outside the limits before geom_boxplot computes its statistics, which biases
# the quartiles/notches; coord_cartesian() only zooms the display.
common_variant_score_replication_plot <- ggplot(categorized_replication_data,
    aes(x = Category,
        y = (common_variant_score_replication - mean_common_replication) / sd_common_replication,
        fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-2.5, 2.5)) +
  theme_cowplot(12)

cardiac_variant_score_replication_plot <- ggplot(categorized_replication_data,
    aes(x = Category,
        y = (cardiac_variant_score_replication - mean_cardiac_replication) / sd_cardiac_replication,
        fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-2.5, 2.5)) +
  theme_cowplot(12)

epilepsy_variant_score_replication_plot <- ggplot(categorized_replication_data,
    aes(x = Category,
        y = (epilepsy_variant_score_replication - mean_epilepsy_replication) / sd_epilepsy_replication,
        fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-2.5, 2.5)) +
  theme_cowplot(12)

noncoding_variant_score_replication_plot <- ggplot(categorized_replication_data,
    aes(x = Category,
        y = (noncoding_variant_score_replication - mean_noncoding_replication) / sd_noncoding_replication,
        fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-2.5, 2.5)) +
  theme_cowplot(12)

scores_replication_plot <- ggplot(categorized_replication_data,
    aes(x = Category,
        y = (scores_replication - mean_scores_replication) / sd_scores_replication,
        fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  scale_fill_manual(values = group_colors) +
  coord_cartesian(ylim = c(-2.5, 2.5)) +
  theme_cowplot(12)

# Print each replication boxplot; warnings about rows removed by the fixed
# y-limits are silenced.
suppressWarnings(print(common_variant_score_replication_plot))

suppressWarnings(print(cardiac_variant_score_replication_plot))

suppressWarnings(print(epilepsy_variant_score_replication_plot))

suppressWarnings(print(noncoding_variant_score_replication_plot))

suppressWarnings(print(scores_replication_plot))

Compute Wilcoxon rank-sum tests comparing each score between cases and controls

# Two-sided Wilcoxon rank-sum (Mann-Whitney) tests: does the distribution of
# each score differ between the two Category groups in the discovery cohort?
wilcox.test_cardiomyopathy <- wilcox.test(cardiac_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_cardiomyopathy
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cardiac_variant_score by Category
## W = 76739, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy <- wilcox.test(epilepsy_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_epilepsy
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  epilepsy_variant_score by Category
## W = 86944, p-value = 3.215e-15
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_common <- wilcox.test(common_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_common
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  common_variant_score by Category
## W = 77863, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding <- wilcox.test(noncoding_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_noncoding
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  noncoding_variant_score by Category
## W = 101916, p-value = 5.208e-06
## alternative hypothesis: true location shift is not equal to 0
# Combined (overall) score test
wilcox.test_combined <- wilcox.test(scores ~ Category, data = categorized_combined_data)
wilcox.test_combined
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores by Category
## W = 57835, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Replication cohort

# Same Wilcoxon rank-sum tests, repeated in the replication cohort.
wilcox.test_cardiomyopathy_replication <- wilcox.test(cardiac_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_cardiomyopathy_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  cardiac_variant_score_replication by Category
## W = 850, p-value = 3.419e-08
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_epilepsy_replication <- wilcox.test(epilepsy_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_epilepsy_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  epilepsy_variant_score_replication by Category
## W = 1286, p-value = 0.0006998
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_common_replication <- wilcox.test(common_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_common_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  common_variant_score_replication by Category
## W = 1667, p-value = 0.1309
## alternative hypothesis: true location shift is not equal to 0
wilcox.test_noncoding_replication <- wilcox.test(noncoding_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_noncoding_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  noncoding_variant_score_replication by Category
## W = 523, p-value = 1.213e-12
## alternative hypothesis: true location shift is not equal to 0
# Combined (overall) score test in the replication cohort
wilcox.test_combined_replication <- wilcox.test(scores_replication ~ Category, data = categorized_replication_data)
wilcox.test_combined_replication
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores_replication by Category
## W = 498, p-value = 4.973e-13
## alternative hypothesis: true location shift is not equal to 0

Plot the distribution of the categories by score

# Overlaid density curves of the combined score by Category.
# FIX: removed the deprecated `..density..` dot-dot notation (deprecated in
# ggplot2 >= 3.4.0 in favor of after_stat()). It was redundant anyway:
# geom_density maps y to the density statistic by default.
distribution_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) +
  geom_density(alpha = 0.5, adjust = 1) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  labs(title = "Normalized Density Plots of Scores by Category",
       x = "Score",
       y = "Density") +
  theme_cowplot(12)


# Warnings about points outside the x-limits are silenced.
suppressWarnings(print(distribution_score_plot))

Plot the filled density of the categories by score

# Stacked (conditional) density: at each score value, the fraction of the
# total density contributed by each Category, shown as a percentage.
density_score_plot <- ggplot(categorized_combined_data,
                             aes(x = scores, fill = Category)) +
  geom_density(position = "fill", alpha = 0.5) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Stacked Density of Scores by Category",
    x = "Score",
    y = "Fraction"
  ) +
  theme_cowplot()

# Print the plot (warnings about points outside the x-limits are silenced)
suppressWarnings(print(density_score_plot))

Plot the ROCs

# Convert 'Category' into a binary outcome: 0 for Control, 1 for case
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)

# ROC curves (pROC::roc) and AUCs for each nested model's predicted probability.
#Full model
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("Full model AUC:")
## [1] "Full model AUC:"
auc(roc_result_full)
## Area under the curve: 0.7638
#common variants
roc_result_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("commonvariant model AUC:")
## [1] "commonvariant model AUC:"
auc(roc_result_common)
## Area under the curve: 0.682
#coding variants
roc_result_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$coding_rare_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("coding variant model AUC:")
## [1] "coding variant model AUC:"
auc(roc_result_coding)
## Area under the curve: 0.7008
#cardiac and common variants
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# FIX: corrected a copy-paste label — this block reports the cardiac-and-common
# model, but the original printed "coding variant model AUC:".
paste("cardiac and common variant model AUC:")
## [1] "cardiac and common variant model AUC:"
auc(roc_result_cardiac_and_common)
## Area under the curve: 0.7364
#common and coding variants
roc_result_common_and_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_and_coding_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

paste("common and coding variant model AUC:")
## [1] "common and coding variant model AUC:"
auc(roc_result_common_and_coding)
## Area under the curve: 0.7449

Plot the replication ROC and Precision-recall

# Convert 'Category' into a binary format: 0 for control, 1 for case
categorized_replication_data$binary_outcome <- ifelse(categorized_replication_data$Category == "Control", 0, 1)

# ROC for the full model's predicted probabilities in the replication cohort.
# NOTE(review): the section header also promises a precision-recall curve, but
# none is computed in this chunk — confirm whether it appears elsewhere.
roc_result_full_replication <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

auc(roc_result_full_replication)
## Area under the curve: 0.874

Figure out the ROC improvement with each variant class and plot it

#compute the ones not plotted
# ROC objects for the single-class and paired models (plot = FALSE: we only
# need the AUCs and DeLong comparisons below).
roc_result_cardiac <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)

# Base AUC for random chance
auc_random_chance <- 0.5

# Collect model names and AUCs once so both data frames are built from the
# same underlying vectors (avoids the duplicated literal lists).
model_names <- c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
                 "Cardiac and Common", "Common and Coding", "Coding", "Full")
auc_values <- c(auc_epilepsy, auc_cardiac, auc_common,
                auc_epilepsy_and_common, auc_cardiac_and_common,
                auc_common_and_coding, auc_coding, auc_full)

# Percentage improvement of each model's AUC over random chance
percentage_changes_df <- data.frame(
  Model = model_names,
  PercentageChange = (auc_values - auc_random_chance) / auc_random_chance * 100
)

# Raw AUC per model
auc_df <- data.frame(
  Model = model_names,
  auc = auc_values
)

# Set the display order: single-class models first, then combined, then full
auc_model_order <- c("Epilepsy", "Cardiac", "Coding", "Common",
                     "Epilepsy and Common", "Cardiac and Common",
                     "Common and Coding", "Full")
percentage_changes_df$Model <- factor(percentage_changes_df$Model,
                                      levels = auc_model_order)

# Bar chart of the percentage AUC gain over chance, one bar per model
auc_performance_plot <- ggplot(percentage_changes_df,
                               aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal() +
  labs(
    title = "Percentage Change Over Random Chance AUC",
    y = "Percentage Change (%)",
    x = ""
  ) +
  theme_cowplot(12)

print(auc_performance_plot)

# Calculate p-values using DeLong's test
# pROC::roc.test compares the AUCs of two ROC curves on the same subjects.
p_values <- list(
  "Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
  "CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
  "Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
  "Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
  "Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
  "CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
  "Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
  "CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
  "Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
  "Full vs Common and Coding" = roc.test(roc_result_common_and_coding, roc_result_full)$p.value
)

# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.03714183
## 
## $`CMAR vs Common`
## [1] 0.8236934
## 
## $`Epilepsy vs Common`
## [1] 0.05250968
## 
## $`Coding vs CMAR`
## [1] 0.1295995
## 
## $`Coding vs Epilepsy`
## [1] 4.40734e-05
## 
## $`CMAR and Common vs Common`
## [1] 3.718671e-06
## 
## $`Common vs Coding`
## [1] 0.336048
## 
## $`CMAR and common vs Common and Coding`
## [1] 0.06421658
## 
## $`Common vs Common and Coding`
## [1] 2.86077e-07
## 
## $`Full vs Common and Coding`
## [1] 0.004615593

Compute the deviance attributable to each class

# Fit an intercept-only (null) logistic model as the deviance baseline
null_model <- glm(Category ~ 1, data = categorized_combined_data, family = binomial())

# Perform ANOVA on the full model
# NOTE(review): `model` is the full logistic model fit earlier in the file
# (outside this chunk) — terms are added sequentially, so each row's deviance
# depends on the order predictors were entered.
anova_results <- anova(model, test = "Chisq")

# Calculate the null deviance and the full model deviance
null_deviance <- deviance(null_model)
full_deviance <- deviance(model)

# Total deviance explained by the full model (analogous to a total SS)
tss <- null_deviance - full_deviance

# Calculate the proportion of deviance explained by each predictor
anova_results$Proportion_Variance_Explained <- anova_results$`Deviance` / tss

# Display the results
anova_results
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Category
## 
## Terms added sequentially (first to last)
## 
## 
##                               Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                                            992     1370.0         
## Normalized_Heart_rate          1    8.094       991     1361.9  0.00444
## Normalized_PR_interval         1    6.193       990     1355.7  0.01282
## Normalized_QTc                 1    5.429       989     1350.3  0.01980
## Normalized_BP                  1   21.492       988     1328.8  0.00000
## Normalized_QRS                 1   32.145       987     1296.6  0.00000
## Normalized_LVEF                1   11.611       986     1285.0  0.00066
## Normalized_LVESV               1    2.819       985     1282.2  0.09316
## Normalized_SVI                 1    4.344       984     1277.8  0.03715
## Normalized_Afib                1    0.736       983     1277.1  0.39103
## Normalized_LVEDVI              1    6.693       982     1270.4  0.00968
## Normalized_SV                  1    1.424       981     1269.0  0.23282
## Epilepsy_rare                  1    7.220       980     1261.8  0.00721
## Epilepsy_ultrarare             1   11.748       979     1250.0  0.00061
## Cardiomyopathy_rare            1    6.754       978     1243.3  0.00935
## Cardiomyopathy_ultrarare       1    3.595       977     1239.7  0.05795
## PLP_Epilepsy                   1    0.009       976     1239.7  0.92564
## PLP_Cardiomyopathy             1   46.892       975     1192.8  0.00000
## Cardiomyopathy_noncoding_rare  1    3.390       974     1189.4  0.06561
## Epilepsy_noncoding_rare        1   35.430       973     1154.0  0.00000
## Cardiomyopathy_null            1   16.730       972     1137.2  0.00004
## Epilepsy_null                  1    4.795       971     1132.4  0.02855
##                               Proportion_Variance_Explained
## NULL                                                       
## Normalized_Heart_rate                              0.034075
## Normalized_PR_interval                             0.026072
## Normalized_QTc                                     0.022857
## Normalized_BP                                      0.090478
## Normalized_QRS                                     0.135322
## Normalized_LVEF                                    0.048878
## Normalized_LVESV                                   0.011867
## Normalized_SVI                                     0.018286
## Normalized_Afib                                    0.003097
## Normalized_LVEDVI                                  0.028176
## Normalized_SV                                      0.005993
## Epilepsy_rare                                      0.030394
## Epilepsy_ultrarare                                 0.049455
## Cardiomyopathy_rare                                0.028434
## Cardiomyopathy_ultrarare                           0.015135
## PLP_Epilepsy                                       0.000037
## PLP_Cardiomyopathy                                 0.197408
## Cardiomyopathy_noncoding_rare                      0.014270
## Epilepsy_noncoding_rare                            0.149152
## Cardiomyopathy_null                                0.070428
## Epilepsy_null                                      0.020185

REDO the “improvement” testing in the replication cohort

# Compute ROC curves for all models
# NOTE: these assignments overwrite the discovery-cohort ROC objects of the
# same names with replication-cohort versions.
roc_result_cardiac <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$epilepsy_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$cardiac_and_common_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_common_and_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$common_and_coding_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_coding <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$coding_rare_probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc_result_full <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = FALSE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Calculate AUC for each ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)

# Base AUC for random chance
auc_random_chance <- 0.5

# Collect model names and replication AUCs once so both data frames share the
# same underlying vectors.
model_names <- c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common",
                 "Cardiac and Common", "Common and Coding", "Coding", "Full")
auc_values <- c(auc_epilepsy, auc_cardiac, auc_common,
                auc_epilepsy_and_common, auc_cardiac_and_common,
                auc_common_and_coding, auc_coding, auc_full)

# Percentage improvement of each model's AUC over random chance
percentage_changes_df <- data.frame(
  Model = model_names,
  PercentageChange = (auc_values - auc_random_chance) / auc_random_chance * 100
)

# Raw AUC per model (replication cohort)
auc_df <- data.frame(
  Model = model_names,
  auc = auc_values
)

# Set the display order: single-class models first, then combined, then full
auc_model_order <- c("Epilepsy", "Cardiac", "Coding", "Common",
                     "Epilepsy and Common", "Cardiac and Common",
                     "Common and Coding", "Full")
percentage_changes_df$Model <- factor(percentage_changes_df$Model,
                                      levels = auc_model_order)

# Bar chart of the percentage AUC gain over chance (replication cohort)
auc_performance_plot <- ggplot(percentage_changes_df,
                               aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal() +
  labs(
    title = "Percentage Change Over Random Chance AUC",
    y = "Percentage Change (%)",
    x = ""
  ) +
  theme_cowplot(12)

print(auc_performance_plot)

# Calculate p-values using DeLong's test
# Same pairwise AUC comparisons as in the discovery cohort (final comparison
# here is Full vs CMAR rather than Full vs Common-and-Coding).
p_values <- list(
  "Epilepsy vs CMAR" = roc.test(roc_result_epilepsy, roc_result_cardiac)$p.value,
  "CMAR vs Common" = roc.test(roc_result_common, roc_result_cardiac)$p.value,
  "Epilepsy vs Common" = roc.test(roc_result_common, roc_result_epilepsy)$p.value,
  "Coding vs CMAR" = roc.test(roc_result_coding, roc_result_cardiac)$p.value,
  "Coding vs Epilepsy" = roc.test(roc_result_coding, roc_result_epilepsy)$p.value,
  "CMAR and Common vs Common" = roc.test(roc_result_cardiac_and_common, roc_result_common)$p.value,
  "Common vs Coding" = roc.test(roc_result_common, roc_result_coding)$p.value,
  "CMAR and common vs Common and Coding" = roc.test(roc_result_cardiac_and_common, roc_result_common_and_coding)$p.value,
  "Common vs Common and Coding" = roc.test(roc_result_common, roc_result_common_and_coding)$p.value,
  "Full vs CMAR" = roc.test(roc_result_cardiac, roc_result_full)$p.value
)

# Print p-values
print(p_values)
## $`Epilepsy vs CMAR`
## [1] 0.05440609
## 
## $`CMAR vs Common`
## [1] 0.0005775265
## 
## $`Epilepsy vs Common`
## [1] 0.1143759
## 
## $`Coding vs CMAR`
## [1] 0.7027383
## 
## $`Coding vs Epilepsy`
## [1] 0.003616045
## 
## $`CMAR and Common vs Common`
## [1] 0.004465682
## 
## $`Common vs Coding`
## [1] 0.001003417
## 
## $`CMAR and common vs Common and Coding`
## [1] 0.1669517
## 
## $`Common vs Common and Coding`
## [1] 0.0009395074
## 
## $`Full vs CMAR`
## [1] 0.04508866

Let's see how common and rare cardiac variants interact

# Scatter of each individual's cardiac (rare) score rank vs common score rank,
# colored by case status.
# FIX: the original built dot_plot twice; the first version was immediately
# overwritten (dead code) and its palette keys ("1", "2-3", "4-6") did not
# match the Category levels, so it has been removed.
dot_plot <- ggplot(categorized_combined_data, aes(x = rank_cardiac, y = rank_common, color = Category)) +
  geom_point(alpha = 1) +
  scale_color_manual(values = group_colors) +
  labs(x = "Cardiac Variant Score Rank", y = "Common Variant Score Rank", color = "Arrest Status") +
  theme_cowplot(12) +
  theme(strip.text.x = element_text(size = 10))

dot_plot

# Split the cohort into four quadrants around the median cardiac and common
# score ranks.
median_cardiac <- median(categorized_combined_data$rank_cardiac, na.rm = TRUE)
median_common <- median(categorized_combined_data$rank_common, na.rm = TRUE)

# Filter for the top half
# NOTE(review): the boundary conditions are not mutually exclusive — a point
# with rank_cardiac == median_cardiac AND rank_common == median_common
# satisfies both quadrant1 (<=, <=) and quadrant4 (>=, >=) and would be
# counted twice. With untied ranks the medians are non-integer and this
# cannot occur, but verify if ranks can tie.
quadrant1 <- categorized_combined_data %>%
  filter(rank_cardiac <= median_cardiac & rank_common <= median_common)

quadrant2 <- categorized_combined_data %>%
  filter(rank_cardiac < median_cardiac & rank_common > median_common)

quadrant3 <- categorized_combined_data %>%
  filter(rank_cardiac > median_cardiac & rank_common < median_common)

quadrant4 <- categorized_combined_data %>%
  filter(rank_cardiac >= median_cardiac & rank_common >= median_common)


# Combine into one data frame, tagging each row with its quadrant label
combined_quadrants <- bind_rows(
  quadrant1 %>% mutate(Quadrant = 'Quadrant 1'),
  quadrant2 %>% mutate(Quadrant = 'Quadrant 2'),
  quadrant3 %>% mutate(Quadrant = 'Quadrant 3'),
  quadrant4 %>% mutate(Quadrant = 'Quadrant 4')
)

# Calculate the count of individuals in each quadrant for each category
# and the total count of individuals in each category
# (Percentage is within-category: each Category's quadrant shares sum to 100)
percentage_by_category <- combined_quadrants %>%
  group_by(Category, Quadrant) %>%
  summarise(Count = n(), .groups = 'drop') %>%
  group_by(Category) %>%
  mutate(Total = sum(Count), Percentage = Count / Total * 100)

# View the result
percentage_by_category
## # A tibble: 8 × 5
## # Groups:   Category [2]
##   Category Quadrant   Count Total Percentage
##   <fct>    <chr>      <int> <int>      <dbl>
## 1 Control  Quadrant 1   235   537       43.8
## 2 Control  Quadrant 2   107   537       19.9
## 3 Control  Quadrant 3    98   537       18.2
## 4 Control  Quadrant 4    97   537       18.1
## 5 zCase    Quadrant 1    79   456       17.3
## 6 zCase    Quadrant 2    75   456       16.4
## 7 zCase    Quadrant 3    84   456       18.4
## 8 zCase    Quadrant 4   218   456       47.8
# Calculate overall proportions for each quadrant
overall_counts <- combined_quadrants %>%
  count(Quadrant) %>%
  mutate(OverallProportion = n / sum(n))

# Calculate total counts by category for scaling expected values
category_totals <- combined_quadrants %>%
  count(Category) %>%
  dplyr::rename(TotalInCategory = n)

# Expected count per (Category, Quadrant) cell under independence.
# Kept for reference; combined_for_plotting below derives the same value.
expected_counts <- combined_quadrants %>%
  dplyr::select(Category, Quadrant) %>%
  distinct() %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory)

# Observed count per cell (also used for the contingency table below)
observed_counts <- combined_quadrants %>%
  count(Category, Quadrant) %>%
  dplyr::rename(Observed = n)

# Observed, expected, and the Observed/Expected ratio in one table for
# plotting.
# FIX: the original recomputed Ratio = Observed / Expected in a second
# mutate() immediately after this pipeline had already created it; the
# redundant recomputation has been removed (the value is identical).
combined_for_plotting <- combined_quadrants %>%
  count(Category, Quadrant) %>%
  dplyr::rename(Observed = n) %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory, Ratio = Observed / Expected)

# Bar chart of the Observed/Expected ratio per quadrant, split by case
# status; the dashed line at y = 1 marks "no enrichment".
quadrant_plot <- ggplot(combined_for_plotting,
                        aes(x = Quadrant, y = Ratio, fill = Category)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  scale_fill_manual(values = group_colors) +
  labs(
    y = "Observed/Expected Ratio",
    x = "Quadrant",
    title = "Observed/Expected Ratios by Category and Quadrant"
  ) +
  theme_cowplot(12) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "Black")

quadrant_plot

# Create the contingency table (Category x Quadrant) from the observed counts
contingency_table <- xtabs(Observed ~ Category + Quadrant, data = observed_counts)

# Print the contingency table
print(contingency_table)
##          Quadrant
## Category  Quadrant 1 Quadrant 2 Quadrant 3 Quadrant 4
##   Control        235        107         98         97
##   zCase           79         75         84        218
# Chi-squared test: is quadrant membership independent of case status?
chi_squared_result <- chisq.test(contingency_table)

# Print the results of the chi-squared test
print(chi_squared_result)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 124.91, df = 3, p-value < 2.2e-16

Filter high cardiac-score individuals, visualize their cardiac vs. common variant scores, and assess interactions

# Filter the data for cardiac_score_quintiles > 3
top_2_quintiles <- categorized_combined_data %>%
  filter(cardiac_score_quintiles > 3)


common_density_score_plot <- ggplot(top_2_quintiles, aes(x = cardiac_variant_score, y = common_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +  
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +  
  labs(x = "Cardiac Variant Score", y = "Common Variant Score", title = "Cardiac Variant Score vs. Common Variant Score by Category") +
  scale_color_manual(values = group_colors) +  
  theme_cowplot(12)  

# Highlight the specific point in red
common_density_score_plot <- common_density_score_plot +
  geom_point(data = subset(top_2_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = common_variant_score), color = "red")

# Print 
print(common_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Do the stats: linear model with a cardiac-score x Category interaction,
# testing whether the cardiac/common score relationship differs in cases
model_common_cardiac <- lm(common_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_common_cardiac)
## 
## Call:
## lm(formula = common_variant_score ~ cardiac_variant_score * Category, 
##     data = top_2_quintiles)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8514 -0.3866  0.0569  0.4076  1.5320 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         -0.22986    0.05737  -4.006 7.38e-05 ***
## cardiac_variant_score               -0.16418    0.09335  -1.759   0.0794 .  
## CategoryzCase                        0.40285    0.07411   5.436 9.59e-08 ***
## cardiac_variant_score:CategoryzCase  0.16371    0.09635   1.699   0.0901 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6477 on 394 degrees of freedom
## Multiple R-squared:  0.1034, Adjusted R-squared:  0.09659 
## F-statistic: 15.15 on 3 and 394 DF,  p-value: 2.386e-09

Visualize the relationship between cardiac and epilepsy variant scores

# Scatter of cardiac vs. epilepsy variant scores (top two cardiac-score
# quintiles), with per-category linear fits; individual CGM0000563 is
# overplotted in red as the highlighted case.
epilepsy_density_score_plot <- ggplot(
  top_2_quintiles,
  aes(x = cardiac_variant_score, y = epilepsy_variant_score)
) +
  geom_point(aes(color = Category), alpha = 0.5) +
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +
  geom_point(
    data = subset(top_2_quintiles, ID == "CGM0000563"),
    aes(x = cardiac_variant_score, y = epilepsy_variant_score),
    color = "red"
  ) +
  labs(
    x = "Cardiac Variant Score",
    y = "Epilepsy Variant Score",
    title = "Cardiac Variant Score vs. Epilepsy Variant Score by Category"
  ) +
  scale_color_manual(values = group_colors) +
  theme_cowplot(12)

# Print
print(epilepsy_density_score_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Do the stats: same interaction model for the epilepsy variant score; the
# cardiac_variant_score:CategoryzCase term tests case-specific coupling
model_epilepsy_cardiac <- lm(epilepsy_variant_score ~ cardiac_variant_score * Category, data = top_2_quintiles)
summary(model_epilepsy_cardiac)
## 
## Call:
## lm(formula = epilepsy_variant_score ~ cardiac_variant_score * 
##     Category, data = top_2_quintiles)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8187 -0.4169 -0.0110  0.3811  9.3169 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -0.1577     0.0861  -1.832   0.0677 .  
## cardiac_variant_score                -0.1611     0.1401  -1.150   0.2509    
## CategoryzCase                        -0.2170     0.1112  -1.952   0.0517 .  
## cardiac_variant_score:CategoryzCase   0.9352     0.1446   6.468 2.95e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.972 on 394 degrees of freedom
## Multiple R-squared:  0.5589, Adjusted R-squared:  0.5555 
## F-statistic: 166.4 on 3 and 394 DF,  p-value: < 2.2e-16

Do the stats for the interaction data

# Wilcoxon rank-sum test of epilepsy variant score, cases vs. controls, within
# the top two cardiac-score quintiles. NOTE(review): the %in% filter looks like
# a safeguard; confirm whether Category can hold levels other than
# "zCase"/"Control" here
CMARR_epilepsy_wilcox_test_results <- wilcox.test(epilepsy_variant_score ~ Category,
                                  data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))



CMARR_epilepsy_wilcox_test_results
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  epilepsy_variant_score by Category
## W = 11515, p-value = 4.869e-10
## alternative hypothesis: true location shift is not equal to 0
# Same comparison for the common variant score
CMARR_common_wilcox_test_results <- wilcox.test(common_variant_score ~ Category,
                                  data = top_2_quintiles %>% filter(Category %in% c("zCase", "Control")))


CMARR_common_wilcox_test_results
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  common_variant_score by Category
## W = 11649, p-value = 1.063e-09
## alternative hypothesis: true location shift is not equal to 0

Calculate how well GAPS performs in PLP- individuals

# Discovery cohort: individuals with no cardiomyopathy P/LP variant
# (PLP_Cardiomyopathy < 1) whose GAPS probability exceeds the discovery
# threshold of 0.83
no_PLPs_categorized_data <- categorized_combined_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.83)

# Count the number of rows where arrest_group equals 1 (Controls).
# NOTE: paste() on a single string is a no-op; it is used here only to echo
# the label into the rendered output
paste("number of PLP negative DISCOVERY controls over the threshold")
## [1] "number of PLP negative DISCOVERY controls over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group == 1))
## [1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative DISCOVERY cases over the threshold")
## [1] "number of PLP negative DISCOVERY cases over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group > 1))
## [1] 22
# Replication cohort: same PLP filter but with its own probability
# threshold of 0.7
no_PLPs_categorized_replication_data <- categorized_replication_data %>%
  filter(PLP_Cardiomyopathy < 1 & probability > 0.7)

# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative REPLICATION controls over the threshold")
## [1] "number of PLP negative REPLICATION controls over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group == 1))
## [1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative REPLICATION cases over the threshold")
## [1] "number of PLP negative REPLICATION cases over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group > 1))
## [1] 22

Now retrain the model on the validation (replication) cohort and apply it to the discovery cohort

# Refit the full GAPS logistic model (normalized PRS terms plus rare/ultrarare
# burden, P/LP, noncoding, and null-variant counts) on the replication cohort
model_replication <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare  + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare  + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null, 
             data = categorized_replication_data, family = binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the warning above indicates (quasi-)separation in this small
# cohort; extreme coefficients (e.g. PLP_Epilepsy, Cardiomyopathy_null) should
# be interpreted with caution
model_replication
## 
## Call:  glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval + 
##     Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
##     Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + 
##     Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + 
##     Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + 
##     Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
##     Cardiomyopathy_null + Epilepsy_null, family = binomial(), 
##     data = categorized_replication_data)
## 
## Coefficients:
##                   (Intercept)          Normalized_Heart_rate  
##                    -10.632909                      -0.489645  
##        Normalized_PR_interval                 Normalized_QTc  
##                     -0.468604                      -0.520174  
##                 Normalized_BP                 Normalized_QRS  
##                      0.909731                       0.538230  
##               Normalized_LVEF               Normalized_LVESV  
##                     -0.863017                      -0.791458  
##                Normalized_SVI                Normalized_Afib  
##                      0.005339                       0.842873  
##             Normalized_LVEDVI                  Normalized_SV  
##                     -0.774451                       0.390626  
##                 Epilepsy_rare             Epilepsy_ultrarare  
##                     -0.127564                      -1.765110  
##           Cardiomyopathy_rare       Cardiomyopathy_ultrarare  
##                     -0.560834                       1.434416  
##                  PLP_Epilepsy             PLP_Cardiomyopathy  
##                     -9.841170                       0.909586  
## Cardiomyopathy_noncoding_rare        Epilepsy_noncoding_rare  
##                     -0.051308                       0.361626  
##           Cardiomyopathy_null                  Epilepsy_null  
##                      6.534668                       0.974009  
## 
## Degrees of Freedom: 125 Total (i.e. Null);  104 Residual
## Null Deviance:       174.2 
## Residual Deviance: 47.6  AIC: 91.6
# Apply the replication-trained model to the discovery cohort; type = "link"
# returns the linear predictor (log-odds scale)
scores_replication_model_on_discovery <- predict(model_replication, newdata = categorized_combined_data,  type = "link")

# Add scores to the dataframes
categorized_combined_data$scores_replication_model_on_discovery <- scores_replication_model_on_discovery

# z normalize (mean/SD computed over the whole discovery cohort)
mean_replication_on_discovery <- mean(categorized_combined_data$scores_replication_model_on_discovery)
sd_replication_on_discovery <- sd(categorized_combined_data$scores_replication_model_on_discovery)


# Notched boxplot of z-scored linear predictors by Category; the fixed
# ylim(-2.5, 2.5) clips extreme points, producing the warning below
scores_replication_model_on_discovery_plot <-  ggplot(categorized_combined_data, aes(x = Category, y = (scores_replication_model_on_discovery - mean_replication_on_discovery)/sd_replication_on_discovery, fill = Category)) +
        geom_boxplot(outlier.shape = NA, notch = TRUE) +  
    scale_fill_manual(values = group_colors) +
    ylim(-2.5, 2.5) +  
    theme_cowplot(12)

scores_replication_model_on_discovery_plot
## Warning: Removed 14 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Wilcoxon rank-sum test: replication-model scores, cases vs. controls in the
# discovery cohort
wilcox.test_result_replication_on_discovery <- wilcox.test(scores_replication_model_on_discovery ~ Category, data = categorized_combined_data)

# View the results
print(wilcox.test_result_replication_on_discovery)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores_replication_model_on_discovery by Category
## W = 84150, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Calculate GAPS on betas trained and applied to replication data (within-replication model)

# Score the replication cohort with its own model (in-sample linear predictors,
# log-odds scale via type = "link")
scores_replication_model_on_replication <- predict(model_replication, newdata = categorized_replication_data,  type = "link")

# Add scores to the dataframes
categorized_replication_data$scores_replication_model_on_replication <- scores_replication_model_on_replication

# z normalize
mean_replication_on_replication <- mean(categorized_replication_data$scores_replication_model_on_replication)
sd_replication_on_replication <- sd(categorized_replication_data$scores_replication_model_on_replication)

# Notched boxplot of z-scored in-sample predictors by Category
scores_replication_model_on_replication_plot <-  ggplot(categorized_replication_data, aes(x = Category, y = (scores_replication_model_on_replication - mean_replication_on_replication)/sd_replication_on_replication, fill = Category)) +
        geom_boxplot(outlier.shape = NA, notch = TRUE) +  
    scale_fill_manual(values = group_colors) +
    ylim(-2.5, 2.5) +  
    theme_cowplot(12)

scores_replication_model_on_replication_plot
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Wilcoxon rank-sum test (comment previously mislabeled this as a t-test)
wilcox.test_replication_on_replication <- wilcox.test(scores_replication_model_on_replication ~ Category, data = categorized_replication_data)

# View the results
print(wilcox.test_replication_on_replication)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  scores_replication_model_on_replication by Category
## W = 97, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

Now calculate AUC for the original data using replication-derived betas (reverse validation)

# Convert linear predictors to probabilities with the logistic CDF.
# NOTE: the trailing comma inside mutate() is tolerated by dplyr
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    probability_replication_on_discovery = plogis(scores_replication_model_on_discovery),
  )

# Convert 'Category' into a binary format: 0 for control (1), 1 for case (2-4)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)

# ROC curve for the replication-trained model evaluated on discovery data
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability_replication_on_discovery, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

auc(roc_result_full)
## Area under the curve: 0.6564
# Calculate the confidence interval for the AUC (DeLong method by default)
ci_auc <- ci.auc(roc_result_full)

# Print the confidence interval
print(ci_auc)
## 95% CI: 0.6221-0.6906 (DeLong)

Replication model on discovery data Odds Ratio

# Rank scores (ties broken by order of appearance) and bin into quintiles 1-5
# via ceiling(rank / (n/5)); trailing comma in mutate() is tolerated by dplyr
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    replication_model_on_discovery_rank = rank(scores_replication_model_on_discovery, ties.method = "first"),
    replication_model_on_discovery_score_quintiles = ceiling(replication_model_on_discovery_rank / (n() / 5)),
  )

 
# Apply function to each odds category (calculate_odds_ratios is a helper
# defined earlier in the file)
replication_model_on_discovery_combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, replication_model_on_discovery_score_quintiles)


# Base-graphics plot of log2 odds ratios per score quintile
plot(log2(replication_model_on_discovery_combined_odds_summary$odds_ratio), 
     ylim = c(-2,10
     ), 
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)", 
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI", 
     xaxt = "n", col = "#3C8C78")
lines(1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio), log2(replication_model_on_discovery_combined_odds_summary$odds_ratio), col = "#3C8C78")

# Add error bars for 95% CI - Combined (code = 3 draws caps at both ends)
arrows(x0 = 1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio), 
       y0 = log2(replication_model_on_discovery_combined_odds_summary$lower_ci), 
       y1 = log2(replication_model_on_discovery_combined_odds_summary$upper_ci), 
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

ANCESTRY

Determine optimal PCs where R2 is not dropping and when RMSE does not rise

# Filter to controls only, so PC selection is not driven by case status
# (original comment said Category == 1; the code tests the label "Control")
filtered_categorized_combined_data <- subset(categorized_combined_data, Category == "Control")

# Define the number of PCs to try
num_pcs <- 1:20

# Preallocate the results frame: one row per candidate number of PCs
cv_results <- data.frame(PCs = num_pcs, R2 = NA, RMSE = NA)

# Perform 10-fold cross-validation for each number of PCs.
# Renamed the loop-local variable from `formula` to `pc_formula`: the original
# name shadowed base::formula(), which is an easy source of confusion.
# NOTE(review): fold assignment is random; the printed values reflect one RNG
# state -- add set.seed() before this loop for exact reproducibility.
for (i in num_pcs) {
  pc_formula <- as.formula(paste0("scores ~ ", paste0("PC", 1:i, collapse = " + ")))
  model <- train(pc_formula, data = filtered_categorized_combined_data, method = "lm",
                 trControl = trainControl(method = "cv", number = 10))

  # Store R-squared and RMSE (method = "lm" yields a single results row);
  # row index i matches PCs because num_pcs is 1:20
  cv_results$R2[i] <- model$results$Rsquared
  cv_results$RMSE[i] <- model$results$RMSE
}

# View the results to choose the optimal number of PCs
print(cv_results)
##    PCs        R2      RMSE
## 1    1 0.0876854 0.8548791
## 2    2 0.1764262 0.8139189
## 3    3 0.1719754 0.8115201
## 4    4 0.2051458 0.7992826
## 5    5 0.1893706 0.8050483
## 6    6 0.1899893 0.8024347
## 7    7 0.1832296 0.8094571
## 8    8 0.1848796 0.8047957
## 9    9 0.1892157 0.8105372
## 10  10 0.1952686 0.8121673
## 11  11 0.1768508 0.8110399
## 12  12 0.1843039 0.8099173
## 13  13 0.1619256 0.8209299
## 14  14 0.1594518 0.8230262
## 15  15 0.1651181 0.8227623
## 16  16 0.1615417 0.8223863
## 17  17 0.1533038 0.8293044
## 18  18 0.1470195 0.8273820
## 19  19 0.1532944 0.8281546
## 20  20 0.1556271 0.8263788
# Overlay R2 (blue) and RMSE (red) against the number of PCs
ggplot(cv_results, aes(x = PCs)) +
  geom_line(aes(y = R2), color = "blue") +
  geom_line(aes(y = RMSE), color = "red") +
  ylab("Model Performance (R² / RMSE)") +
  ggtitle("Cross-Validation of PCs") +
  theme_minimal()

Regress out the influence of the ancestry PCs

# Fit the ancestry adjustment model: regress scores on the first 10 genetic
# PCs (PC1-PC10; the original comment said "C1 to C10") using controls only,
# so the adjustment is not driven by case status
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
                                data = filtered_categorized_combined_data)

# Get the coefficients from the regression (kept for inspection/reporting)
ancestry_coefficients <- coef(ancestry_regression_model)

# Remove the ancestry-predicted component (intercept + PC1..PC10 effects) from
# every individual's score. predict() applies the fitted coefficients to the
# full cohort, replacing the previous hand-written 11-term sum, which was
# error-prone to maintain; unname() keeps the column free of row-name clutter.
categorized_combined_data$adjusted_scores <- categorized_combined_data$scores -
  unname(predict(ancestry_regression_model, newdata = categorized_combined_data))

Get adjusted probabilities

# Map ancestry-adjusted scores (log-odds scale) to probabilities via the
# logistic CDF
categorized_combined_data$adjusted_probability <- plogis(categorized_combined_data$adjusted_scores)

Perform PC clustering

# Extract PCA columns (PC1 to PC20) by name pattern
pca_columns <- categorized_combined_data[, grep("^PC\\d+$", names(categorized_combined_data))]

# Perform Gaussian mixture model (GMM) clustering with 4 components
gmm_result <- Mclust(pca_columns, G = 4)  

# Extract the cluster labels from the GMM result
initial_clusters <- gmm_result$classification

# Select the data points that belong to Cluster 1
cluster1_data <- pca_columns[initial_clusters == 1, ]

# Perform GMM sub-clustering on Cluster 1 (split into 2 sub-clusters)
sub_gmm_result <- Mclust(cluster1_data, G = 2)  

# Integrate the sub-clustering results back into your overall clustering
final_clusters <- initial_clusters

# Assign new cluster labels to the points in Cluster 1 based on the sub-clustering
# We use +4 here to differentiate from the initial clusters (assuming there were 4 clusters initially)
# so sub-clusters {1,2} of Cluster 1 become labels {5,6}
final_clusters[initial_clusters == 1] <- sub_gmm_result$classification + 4


# Select the data points that belong to Cluster 2
cluster2_data <- pca_columns[initial_clusters == 2, ]

# Perform GMM sub-clustering on Cluster 2
sub_gmm_result2 <- Mclust(cluster2_data, G = 2) 

# Integrate the sub-clustering results back into your overall clustering.
# Offset +6 maps sub-clusters {1,2} of Cluster 2 to labels {7,8}; final labels
# are therefore drawn from {3,4,5,6,7,8} with no collisions
final_clusters[initial_clusters == 2] <- sub_gmm_result2$classification + 6  

Add cluster assignments to data

# Add the final cluster assignments to the original dataframe
categorized_combined_data$cluster_gmm <- as.factor(final_clusters)

# Visualize the clustering results on the first two PCs
ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm)) +
  geom_point() +
  ggtitle("GMM Clustering of PCA Data") +
  theme_minimal() +
  scale_color_manual(values = rainbow(length(unique(final_clusters)))) +
  theme(legend.position = "bottom")

# Cross-tabulate GMM cluster membership against self-reported ancestry
contingency_table <- table(categorized_combined_data$cluster_gmm, categorized_combined_data$Ancestry)

# Long-format version of the table for plotting
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("Cluster", "Ancestry", "Count")

# Bar plot to show the makeup of each cluster by ancestry
ggplot(contingency_df, aes(x = Cluster, y = Count, fill = Ancestry)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Makeup of Each Cluster by Ancestry") +
  xlab("Cluster") + ylab("Count") +
  theme_minimal() +
  theme(legend.position = "bottom")

Plot the PCs, ancestry clusters informed by self-reported identities, and plot adjusted data

# Define the mapping of GMM cluster labels to ancestry groups
cluster_to_ancestry <- c("6" = "EUR", "7" = "HIS", "5" = "AFR", "3" = "OTH", "4" = "OTH", "8" = "HIS")

# Translate each individual's cluster label to its ancestry group with a single
# vectorized named-vector lookup. This replaces a per-element sapply() and a
# redundant intermediate as.character() assignment that was immediately
# overwritten; unname() drops the lookup names so the column is a plain vector.
categorized_combined_data$cluster_gmm_Ancestry <- unname(
  cluster_to_ancestry[as.character(categorized_combined_data$cluster_gmm)]
)

# Convert the new column to a factor, with levels ordered by first appearance
# in the mapping (EUR, HIS, AFR, OTH)
categorized_combined_data$cluster_gmm_Ancestry <- factor(
  categorized_combined_data$cluster_gmm_Ancestry,
  levels = unique(cluster_to_ancestry)
)

# Create a summary dataframe with counts per Category x ancestry cluster
# (used for the geom_text count labels below)
summary_data <- categorized_combined_data %>%
  group_by(Category, cluster_gmm_Ancestry) %>%
  summarise(count = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'Category'. You can override using the
## `.groups` argument.
# Define the palette (one color per ancestry group)
palette_colors <- wes_palette("FantasticFox1", length(unique(categorized_combined_data$cluster_gmm_Ancestry)), type = "continuous")


# Stacked-percentage bar plot of ancestry composition within each category,
# annotated with raw counts
ancestry_makeup <- ggplot(categorized_combined_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
  xlab("Category") +
  ylab("Percentage") +
  theme(legend.position = "bottom") +
  scale_fill_manual(values = palette_colors) +
  geom_text(data = summary_data, aes(label = count, y = count / sum(count), group = cluster_gmm_Ancestry),
            position = position_fill(vjust = 0.5), color = "black") + 
  theme_cowplot(12)


# Visualize the clustering results, now colored by assigned ancestry group
clusters <- ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm_Ancestry)) +
  geom_point() +
  ggtitle("GMM Clustering of PCA Data") +
  theme_minimal() +
  scale_color_manual(values = palette_colors) +
  theme(legend.position = "bottom") + 
  theme_cowplot(12)


# Helper: standardized notched boxplot of a score column, filled by Category,
# with the shared palette, cowplot theme, and fixed y range of [-3, 3].
# `x_var` defaults to the ancestry clusters; pass "Category" for overall
# plots. This replaces six near-identical copy-pasted ggplot chains.
plot_scores_boxplot <- function(data, y_var, y_lab, title,
                                x_var = "cluster_gmm_Ancestry", x_lab = x_var) {
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]], fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    labs(title = title, x = x_lab, y = y_lab) +
    scale_fill_manual(values = group_colors) +
    theme(legend.position = "bottom") +
    theme_cowplot(12) +
    ylim(-3, 3)
}

# Generalized Plot for Scores
ancestry_scores <- plot_scores_boxplot(
  categorized_combined_data, "scores", "Scores",
  "Scores by cluster_gmm_Ancestry and Category"
)

# Generalized Plot for Adjusted Scores
ancestry_adjusted_scores <- plot_scores_boxplot(
  categorized_combined_data, "adjusted_scores", "Adjusted Scores",
  "Adjusted Scores by cluster_gmm_Ancestry and Category"
)

# Adjusted Scores by Category (not ancestry-specific)
ancestry_adjusted_total_scores <- plot_scores_boxplot(
  categorized_combined_data, "adjusted_scores", "Adjusted Scores",
  "Adjusted Scores by Category",
  x_var = "Category", x_lab = "Category"
)

# Specific Plot for Cardiac Variant Scores
cardiac_variant_scores <- plot_scores_boxplot(
  categorized_combined_data, "cardiac_variant_score", "Cardiac Variant Scores",
  "Cardiac Variant Scores by cluster_gmm_Ancestry and Category"
)

# Specific Plot for Epilepsy Variant Scores
epilepsy_variant_scores <- plot_scores_boxplot(
  categorized_combined_data, "epilepsy_variant_score", "Epilepsy Variant Scores",
  "Epilepsy Variant Scores by cluster_gmm_Ancestry and Category"
)

# Specific Plot for Common Variant Scores
common_variant_scores <- plot_scores_boxplot(
  categorized_combined_data, "common_variant_score", "Common Variant Scores",
  "Common Variant Scores by cluster_gmm_Ancestry and Category"
)

# Specific Plot for Noncoding Variant Scores
noncoding_variant_scores <- plot_scores_boxplot(
  categorized_combined_data, "noncoding_variant_score", "Noncoding Variant Scores",
  "Noncoding Variant Scores by cluster_gmm_Ancestry and Category"
)

# Print Plots. suppressWarnings() hides the "rows removed" warnings caused by
# the fixed ylim(-3, 3) clipping extreme points
clusters

ancestry_makeup

suppressWarnings(print(ancestry_scores))

suppressWarnings(print(ancestry_adjusted_scores))

suppressWarnings(print(ancestry_adjusted_total_scores))

suppressWarnings(print(cardiac_variant_scores))

suppressWarnings(print(epilepsy_variant_scores))
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

suppressWarnings(print(common_variant_scores))

suppressWarnings(print(noncoding_variant_scores))
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

# Wilcoxon Test for Adjusted Scores (all ancestry groups pooled)
wilcox.test(adjusted_scores ~ Category, data = categorized_combined_data)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  adjusted_scores by Category
## W = 79303, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Wilcoxon Test within each ancestry group (Adjusted Scores)
adjusted_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(
    p_value = wilcox.test(adjusted_scores ~ Category)$p.value
  )
print(adjusted_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry       p_value
##   <fct>                        <dbl>
## 1 EUR                  0.00000000433
## 2 HIS                  0.0181       
## 3 AFR                  0.0000162    
## 4 OTH                  0.00548
# Wilcoxon Test within each ancestry group (Scores)
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(
    p_value = wilcox.test(scores ~ Category)$p.value
  )
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry  p_value
##   <fct>                   <dbl>
## 1 EUR                  1.75e-12
## 2 HIS                  2.43e- 3
## 3 AFR                  1.17e- 7
## 4 OTH                  1.32e- 3

Interaction model to evaluate ancestry x PGS interactions

# Interaction model: Does ancestry modify the effect of the variables on case/control?
# Each normalized PRS term is crossed with the ancestry cluster; significant
# interaction terms would indicate ancestry-specific score effects
interaction_model <- glm(Category ~ Normalized_LVEF * cluster_gmm_Ancestry + 
                         Normalized_Heart_rate * cluster_gmm_Ancestry + 
                         Normalized_SV * cluster_gmm_Ancestry + 
                         Normalized_LVEDVI * cluster_gmm_Ancestry + 
                         Normalized_Afib * cluster_gmm_Ancestry + 
                         Normalized_PR_interval * cluster_gmm_Ancestry + 
                         Normalized_LVESV * cluster_gmm_Ancestry + 
                         Normalized_SVI * cluster_gmm_Ancestry + 
                         Normalized_BP * cluster_gmm_Ancestry + 
                         Normalized_QTc * cluster_gmm_Ancestry, 
                         data = categorized_combined_data, family = binomial)

summary(interaction_model)
## 
## Call:
## glm(formula = Category ~ Normalized_LVEF * cluster_gmm_Ancestry + 
##     Normalized_Heart_rate * cluster_gmm_Ancestry + Normalized_SV * 
##     cluster_gmm_Ancestry + Normalized_LVEDVI * cluster_gmm_Ancestry + 
##     Normalized_Afib * cluster_gmm_Ancestry + Normalized_PR_interval * 
##     cluster_gmm_Ancestry + Normalized_LVESV * cluster_gmm_Ancestry + 
##     Normalized_SVI * cluster_gmm_Ancestry + Normalized_BP * cluster_gmm_Ancestry + 
##     Normalized_QTc * cluster_gmm_Ancestry, family = binomial, 
##     data = categorized_combined_data)
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                     0.985996   0.189985   5.190
## Normalized_LVEF                                -0.056041   0.160970  -0.348
## cluster_gmm_AncestryHIS                        -2.618199   0.270454  -9.681
## cluster_gmm_AncestryAFR                        -1.501416   0.291432  -5.152
## cluster_gmm_AncestryOTH                         0.085666   0.347552   0.246
## Normalized_Heart_rate                          -0.148686   0.127560  -1.166
## Normalized_SV                                  -0.129167   0.144625  -0.893
## Normalized_LVEDVI                               0.099653   0.166314   0.599
## Normalized_Afib                                -0.119832   0.128547  -0.932
## Normalized_PR_interval                          0.233999   0.117628   1.989
## Normalized_LVESV                                0.006579   0.184338   0.036
## Normalized_SVI                                 -0.007414   0.176301  -0.042
## Normalized_BP                                   0.100302   0.128746   0.779
## Normalized_QTc                                  0.144560   0.129390   1.117
## Normalized_LVEF:cluster_gmm_AncestryHIS         0.161473   0.251740   0.641
## Normalized_LVEF:cluster_gmm_AncestryAFR        -0.057026   0.243657  -0.234
## Normalized_LVEF:cluster_gmm_AncestryOTH         0.149925   0.379303   0.395
## cluster_gmm_AncestryHIS:Normalized_Heart_rate   0.279794   0.205353   1.363
## cluster_gmm_AncestryAFR:Normalized_Heart_rate   0.248188   0.199291   1.245
## cluster_gmm_AncestryOTH:Normalized_Heart_rate   0.021320   0.282468   0.075
## cluster_gmm_AncestryHIS:Normalized_SV           0.018127   0.241476   0.075
## cluster_gmm_AncestryAFR:Normalized_SV          -0.005352   0.227930  -0.023
## cluster_gmm_AncestryOTH:Normalized_SV          -0.099748   0.367285  -0.272
## cluster_gmm_AncestryHIS:Normalized_LVEDVI      -0.059796   0.277594  -0.215
## cluster_gmm_AncestryAFR:Normalized_LVEDVI      -0.012314   0.242344  -0.051
## cluster_gmm_AncestryOTH:Normalized_LVEDVI       0.162975   0.409278   0.398
## cluster_gmm_AncestryHIS:Normalized_Afib        -0.063225   0.207940  -0.304
## cluster_gmm_AncestryAFR:Normalized_Afib         0.092741   0.189536   0.489
## cluster_gmm_AncestryOTH:Normalized_Afib         0.318674   0.279610   1.140
## cluster_gmm_AncestryHIS:Normalized_PR_interval -0.107243   0.195593  -0.548
## cluster_gmm_AncestryAFR:Normalized_PR_interval -0.006562   0.185089  -0.035
## cluster_gmm_AncestryOTH:Normalized_PR_interval  0.140872   0.354449   0.397
## cluster_gmm_AncestryHIS:Normalized_LVESV        0.611566   0.302932   2.019
## cluster_gmm_AncestryAFR:Normalized_LVESV        0.002846   0.265806   0.011
## cluster_gmm_AncestryOTH:Normalized_LVESV        0.084992   0.343738   0.247
## cluster_gmm_AncestryHIS:Normalized_SVI         -0.031809   0.270997  -0.117
## cluster_gmm_AncestryAFR:Normalized_SVI          0.088692   0.237651   0.373
## cluster_gmm_AncestryOTH:Normalized_SVI          0.064705   0.353658   0.183
## cluster_gmm_AncestryHIS:Normalized_BP           0.081335   0.195361   0.416
## cluster_gmm_AncestryAFR:Normalized_BP           0.286601   0.198513   1.444
## cluster_gmm_AncestryOTH:Normalized_BP          -0.185752   0.315792  -0.588
## cluster_gmm_AncestryHIS:Normalized_QTc         -0.256980   0.198895  -1.292
## cluster_gmm_AncestryAFR:Normalized_QTc         -0.261544   0.239132  -1.094
## cluster_gmm_AncestryOTH:Normalized_QTc         -0.395822   0.295921  -1.338
##                                                Pr(>|z|)    
## (Intercept)                                    2.10e-07 ***
## Normalized_LVEF                                  0.7277    
## cluster_gmm_AncestryHIS                         < 2e-16 ***
## cluster_gmm_AncestryAFR                        2.58e-07 ***
## cluster_gmm_AncestryOTH                          0.8053    
## Normalized_Heart_rate                            0.2438    
## Normalized_SV                                    0.3718    
## Normalized_LVEDVI                                0.5490    
## Normalized_Afib                                  0.3512    
## Normalized_PR_interval                           0.0467 *  
## Normalized_LVESV                                 0.9715    
## Normalized_SVI                                   0.9665    
## Normalized_BP                                    0.4359    
## Normalized_QTc                                   0.2639    
## Normalized_LVEF:cluster_gmm_AncestryHIS          0.5212    
## Normalized_LVEF:cluster_gmm_AncestryAFR          0.8150    
## Normalized_LVEF:cluster_gmm_AncestryOTH          0.6926    
## cluster_gmm_AncestryHIS:Normalized_Heart_rate    0.1730    
## cluster_gmm_AncestryAFR:Normalized_Heart_rate    0.2130    
## cluster_gmm_AncestryOTH:Normalized_Heart_rate    0.9398    
## cluster_gmm_AncestryHIS:Normalized_SV            0.9402    
## cluster_gmm_AncestryAFR:Normalized_SV            0.9813    
## cluster_gmm_AncestryOTH:Normalized_SV            0.7859    
## cluster_gmm_AncestryHIS:Normalized_LVEDVI        0.8295    
## cluster_gmm_AncestryAFR:Normalized_LVEDVI        0.9595    
## cluster_gmm_AncestryOTH:Normalized_LVEDVI        0.6905    
## cluster_gmm_AncestryHIS:Normalized_Afib          0.7611    
## cluster_gmm_AncestryAFR:Normalized_Afib          0.6246    
## cluster_gmm_AncestryOTH:Normalized_Afib          0.2544    
## cluster_gmm_AncestryHIS:Normalized_PR_interval   0.5835    
## cluster_gmm_AncestryAFR:Normalized_PR_interval   0.9717    
## cluster_gmm_AncestryOTH:Normalized_PR_interval   0.6910    
## cluster_gmm_AncestryHIS:Normalized_LVESV         0.0435 *  
## cluster_gmm_AncestryAFR:Normalized_LVESV         0.9915    
## cluster_gmm_AncestryOTH:Normalized_LVESV         0.8047    
## cluster_gmm_AncestryHIS:Normalized_SVI           0.9066    
## cluster_gmm_AncestryAFR:Normalized_SVI           0.7090    
## cluster_gmm_AncestryOTH:Normalized_SVI           0.8548    
## cluster_gmm_AncestryHIS:Normalized_BP            0.6772    
## cluster_gmm_AncestryAFR:Normalized_BP            0.1488    
## cluster_gmm_AncestryOTH:Normalized_BP            0.5564    
## cluster_gmm_AncestryHIS:Normalized_QTc           0.1963    
## cluster_gmm_AncestryAFR:Normalized_QTc           0.2741    
## cluster_gmm_AncestryOTH:Normalized_QTc           0.1810    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1370.0  on 992  degrees of freedom
## Residual deviance: 1091.5  on 949  degrees of freedom
## AIC: 1179.5
## 
## Number of Fisher Scoring iterations: 4

Now redo the ancestry corrections by adjusting the variables prior to fully incorporating them into GAPS

# List of scores to correct for ancestry (common-variant PRS plus
# rare/ultrarare burden and null scores)
score_list <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV", 
                "Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV", 
                "Normalized_Heart_rate", "Normalized_LVEF", "Epilepsy_rare", "Epilepsy_ultrarare", 
                "Cardiomyopathy_rare", "Cardiomyopathy_ultrarare", "PLP_Epilepsy", "PLP_Cardiomyopathy",
                "Cardiomyopathy_noncoding_rare", "Epilepsy_noncoding_rare", "Cardiomyopathy_null", "Epilepsy_null")

# Adjust each score in the training data by residualizing on ancestry PCs.
# The PC regression is fit on the filtered training set and its fitted
# ancestry component is then subtracted from every sample's score.
for (score in score_list) {
  
  # Regress the score on the first 10 principal components
  ancestry_regression_model <- lm(
    reformulate(paste0("PC", 1:10), response = score),
    data = filtered_categorized_combined_data
  )
  
  # Remove the ancestry effect. predict() computes exactly
  # intercept + sum(PC_i * beta_i), replacing the error-prone
  # hand-written term-by-term arithmetic.
  categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_combined_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_combined_data)
}

# Fit the logistic regression on the ancestry-adjusted scores
adjusted_score_list <- paste0(score_list, "_ANCESTRY_adjusted")
model_ANCESTRY_adjusted <- glm(
  reformulate(adjusted_score_list, response = "Category"),
  data = categorized_combined_data,
  family = binomial()
)

# Linear predictor (log-odds scale) for every training sample
categorized_combined_data$Scores_ANCESTRY_adjusted <-
  predict(model_ANCESTRY_adjusted, type = "link")

# Plot Adjusted Scores
# Plot adjusted scores by case/control category.
# NOTE: theme_cowplot() is a *complete* theme, so it must be added BEFORE
# the theme() tweak — in the original order it silently reset
# legend.position back to the default.
ancestry_ADJUSTED_pre_by_category <- ggplot(categorized_combined_data, aes(x = Category, y = Scores_ANCESTRY_adjusted, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Adjusted Scores by Category", x = "Category", y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)

ancestry_ADJUSTED_pre_by_category
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Plot adjusted scores by ancestry cluster, with a fill pattern per
# Category so case/control boxes remain distinguishable in greyscale.
# theme_cowplot() placed before theme() so legend.position is not
# overridden by the complete theme.
ancestry_ADJUSTED_pre_by_ancestry <- ggplot(categorized_combined_data, 
                                            aes(x = cluster_gmm_Ancestry, 
                                                y = Scores_ANCESTRY_adjusted, 
                                                fill = Category, 
                                                pattern = Category)) +
  geom_boxplot_pattern(outlier.shape = NA, 
                       notch = TRUE, 
                       pattern_density = 0.1, 
                       pattern_fill = "black", 
                       pattern_spacing = 0.01, 
                       pattern_angle = 90) + 
  labs(title = "Adjusted Scores by Ancestry", 
       x = "Ancestry Cluster", 
       y = "Adjusted Scores") +
  scale_fill_manual(values = group_colors) +
  scale_pattern_manual(values = c("stripe", "crosshatch", "dot", "none")) + 
  theme_cowplot(12) + 
  theme(legend.position = "bottom") + 
  ylim(-3, 3)

ancestry_ADJUSTED_pre_by_ancestry
## Warning: Removed 35 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Convert the linear predictor to a probability and compute the
# training-set ROC/AUC
categorized_combined_data$adjusted_probability <-
  plogis(categorized_combined_data$Scores_ANCESTRY_adjusted)

roc_result_full_PRE_ADJUSTED <- roc(categorized_combined_data$binary_outcome,
                                    categorized_combined_data$adjusted_probability,
                                    plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

print(paste("Training AUC:", auc(roc_result_full_PRE_ADJUSTED)))
## [1] "Training AUC: 0.758878107746088"
# Adjust each score in the replication data using the TRAINING-set PC
# model, so both cohorts receive the same ancestry correction.
for (score in score_list) {
  
  # Ancestry-adjustment regression, fit in the training set only
  ancestry_regression_model <- lm(
    reformulate(paste0("PC", 1:10), response = score),
    data = filtered_categorized_combined_data
  )
  
  # Subtract the training-derived ancestry component from the replication
  # scores. predict() reproduces intercept + sum(PC_i * beta_i) and
  # replaces the hand-written ten-term arithmetic.
  categorized_replication_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_replication_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_replication_data)
}

# Score the replication set with the training model, restricted to the
# adjusted-score columns so no unadjusted variables leak into predict()
adjusted_score_list_replication <- paste0(score_list, "_ANCESTRY_adjusted")

replication_inputs <- categorized_replication_data[, adjusted_score_list_replication, drop = FALSE]
categorized_replication_data$Scores_ANCESTRY_adjusted_replication <-
  predict(model_ANCESTRY_adjusted, newdata = replication_inputs, type = "link")

# Logistic transform of the linear predictor to probabilities
categorized_replication_data$adjusted_probability_replication <-
  plogis(categorized_replication_data$Scores_ANCESTRY_adjusted_replication)

# Replication-set ROC and AUC
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome, 
                                categorized_replication_data$adjusted_probability_replication)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(paste("Replication AUC:", auc(roc_result_full_adjusted)))
## [1] "Replication AUC: 0.874778649127245"
plot(roc_result_full_adjusted, xlim = c(1, 0))

# Within each ancestry cluster, Wilcoxon rank-sum test of the adjusted
# integrated score between cases and controls
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(Scores_ANCESTRY_adjusted ~ Category)$p.value)
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry  p_value
##   <fct>                   <dbl>
## 1 EUR                  6.46e-16
## 2 HIS                  5.57e- 6
## 3 AFR                  8.54e- 9
## 4 OTH                  1.07e- 4

Now plot the data using the ancestry corrections made by adjusting the variables prior to fully incorporating them into GAPS

# List of common-variant (PRS) scores to correct for ancestry
common_variant_scores <- c("Normalized_QTc", "Normalized_BP", "Normalized_SVI", "Normalized_LVESV", 
                           "Normalized_PR_interval", "Normalized_Afib", "Normalized_LVEDVI", "Normalized_SV", 
                           "Normalized_Heart_rate", "Normalized_LVEF")

# Residualize each common-variant score on PC1-PC10 (regression fit in the
# filtered training set, correction applied to the full training data)
for (score in common_variant_scores) {
  
  # Regress the score on the first 10 principal components
  ancestry_regression_model <- lm(
    reformulate(paste0("PC", 1:10), response = score),
    data = filtered_categorized_combined_data
  )
  
  # Remove the fitted ancestry effect; predict() computes exactly
  # intercept + sum(PC_i * beta_i)
  categorized_combined_data[[paste0(score, "_ANCESTRY_adjusted")]] <-
    categorized_combined_data[[score]] -
    predict(ancestry_regression_model, newdata = categorized_combined_data)
}

# Logistic regression using only the adjusted common-variant scores
adjusted_common_variant_scores <- paste0(common_variant_scores, "_ANCESTRY_adjusted")
model_common_variant_ADJUSTED_pre <- glm(
  reformulate(adjusted_common_variant_scores, response = "Category"),
  data = categorized_combined_data, family = binomial()
)

# Linear predictor = integrated common-variant score
categorized_combined_data$Common_Variant_Score_ADJUSTED_pre <-
  predict(model_common_variant_ADJUSTED_pre, type = "link")

# Plot common-variant score by category.
# theme_cowplot() is a complete theme and must precede the theme() tweak,
# otherwise legend.position is silently reset.
common_variant_plot_by_category_ADJUSTED_pre <- ggplot(categorized_combined_data, aes(x = Category, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Common Variant Score by Category", x = "Category", y = "Common Variant Score") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)

common_variant_plot_by_category_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# Plot common-variant score by ancestry cluster.
# theme_cowplot() before theme() so legend.position takes effect.
common_variant_plot_by_ancestry_ADJUSTED_pre <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = Common_Variant_Score_ADJUSTED_pre, fill = Category)) +
  geom_boxplot(outlier.shape = NA, notch = TRUE) +
  labs(title = "Common Variant Score by Ancestry", x = "Ancestry Cluster", y = "Common Variant Score") +
  scale_fill_manual(values = group_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom") +
  ylim(-3, 3)

common_variant_plot_by_ancestry_ADJUSTED_pre
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

# Overall Wilcoxon test of the common-variant score between categories,
# pooled across all samples (NOT stratified by ancestry — the stratified
# version follows below)
scores_wilcox_results <- categorized_combined_data %>%
  summarise(
    p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value
  )
print(scores_wilcox_results)
##        p_value
## 1 5.584507e-20
# Wilcoxon test of the common-variant score within each ancestry cluster
scores_wilcox_results <- categorized_combined_data %>%
  group_by(cluster_gmm_Ancestry) %>%
  summarise(p_value = wilcox.test(Common_Variant_Score_ADJUSTED_pre ~ Category)$p.value)
print(scores_wilcox_results)
## # A tibble: 4 × 2
##   cluster_gmm_Ancestry     p_value
##   <fct>                      <dbl>
## 1 EUR                  0.000000680
## 2 HIS                  0.0000165  
## 3 AFR                  0.000238   
## 4 OTH                  0.0857

Compute the odds ratios in the highest quintile across ancestry before and after the post-integration adjustments

# Quintile the POST-integration ancestry-adjusted score.
# `adjusted_scores` is computed earlier in the file; rank with
# ties.method = "first" guarantees a unique rank per row so quintile
# bins have (near-)equal size.
categorized_combined_data <- categorized_combined_data %>%
  mutate(
    rank_adjusted = rank(adjusted_scores, ties.method = "first"),
    score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
  )

# Subset by GMM ancestry cluster
AFR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "AFR", ]
HIS <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "HIS", ]
EUR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "EUR", ]
OTH <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "OTH", ]

# Apply the odds-ratio function defined earlier in the file.
# These calls use the UNADJUSTED `score_quintiles` (also computed
# earlier) as the pre-adjustment baseline.
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
combined_odds_summary_AFR = calculate_odds_ratios(AFR, Category, score_quintiles)
combined_odds_summary_HIS = calculate_odds_ratios(HIS, Category, score_quintiles)
combined_odds_summary_EUR = calculate_odds_ratios(EUR, Category, score_quintiles)
combined_odds_summary_OTH = calculate_odds_ratios(OTH, Category, score_quintiles)

combined_odds_summary
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_2_6  odds odds_ratio
##             <dbl>            <int>              <int> <dbl>      <dbl>
## 1               1              162                 36 0.222       1   
## 2               2              138                 61 0.442       1.99
## 3               3              116                 82 0.707       3.18
## 4               4               89                110 1.24        5.56
## 5               5               32                167 5.22       23.5 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_2_6  odds odds_ratio
##             <dbl>            <int>              <int> <dbl>      <dbl>
## 1               1               82                 21 0.256       1   
## 2               2               56                 18 0.321       1.26
## 3               3               38                 21 0.553       2.16
## 4               4               13                 13 1           3.90
## 5               5                4                 17 4.25       16.6 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_2_6  odds odds_ratio
##             <dbl>            <int>              <int> <dbl>      <dbl>
## 1               1               69                 10 0.145       1   
## 2               2               56                  9 0.161       1.11
## 3               3               41                 13 0.317       2.19
## 4               4               49                 17 0.347       2.39
## 5               5               15                  9 0.6         4.14
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_2_6  odds odds_ratio
##             <dbl>            <int>              <int> <dbl>      <dbl>
## 1               1                8                  2  0.25       1   
## 2               2               19                 24  1.26       5.05
## 3               3               29                 39  1.34       5.38
## 4               4               23                 69  3         12   
## 5               5               11                112 10.2       40.7 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH 
## # A tibble: 5 × 8
##   score_quintiles count_category_1 count_category_2_6  odds odds_ratio
##             <dbl>            <int>              <int> <dbl>      <dbl>
## 1               1                3                  3  1          1   
## 2               2                7                 10  1.43       1.43
## 3               3                8                  9  1.12       1.12
## 4               4                4                 11  2.75       2.75
## 5               5                2                 29 14.5       14.5 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Apply the odds-ratio function (defined earlier in the file) to the
# post-integration adjusted-score quintiles, overall and per ancestry
combined_odds_summary_adjust     <- calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR_adjust <- calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS_adjust <- calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR_adjust <- calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH_adjust <- calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)

combined_odds_summary_adjust
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
##                      <dbl>            <int>              <int> <dbl>      <dbl>
## 1                        1              134                 64 0.478       1   
## 2                        2              131                 68 0.519       1.09
## 3                        3              118                 80 0.678       1.42
## 4                        4              105                 94 0.895       1.87
## 5                        5               49                150 3.06        6.41
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
##                      <dbl>            <int>              <int> <dbl>      <dbl>
## 1                        1               52                 16 0.308      1    
## 2                        2               40                 11 0.275      0.894
## 3                        3               45                 16 0.356      1.16 
## 4                        4               45                 25 0.556      1.81 
## 5                        5               11                 22 2          6.5  
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
##                      <dbl>            <int>              <int> <dbl>      <dbl>
## 1                        1               56                  8 0.143       1   
## 2                        2               57                 11 0.193       1.35
## 3                        3               44                 13 0.295       2.07
## 4                        4               43                 17 0.395       2.77
## 5                        5               30                  9 0.3         2.1 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
##                      <dbl>            <int>              <int> <dbl>      <dbl>
## 1                        1               23                 30  1.30       1   
## 2                        2               27                 41  1.52       1.16
## 3                        3               20                 44  2.2        1.69
## 4                        4               14                 47  3.36       2.57
## 5                        5                6                 84 14         10.7 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH_adjust 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6   odds odds_ratio
##                      <dbl>            <int>              <int>  <dbl>      <dbl>
## 1                        1                3                 10  3.33       1    
## 2                        2                7                  5  0.714      0.214
## 3                        3                9                  7  0.778      0.233
## 4                        4                3                  5  1.67       0.5  
## 5                        5                2                 35 17.5        5.25 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>

Extract the odds ratios by ancestry for plotting

# Extract the quintile-5 (top 20%) odds ratio and 95% CI from each summary
quintile_5_data <- data.frame(
  group = c("Total","AFR", "HIS", "EUR", "OTH"),
  odds_ratio = c(combined_odds_summary$odds_ratio[5],
                 combined_odds_summary_AFR$odds_ratio[5],
                 combined_odds_summary_HIS$odds_ratio[5],
                 combined_odds_summary_EUR$odds_ratio[5],
                 combined_odds_summary_OTH$odds_ratio[5]),
  lower_ci = c(combined_odds_summary$lower_ci[5],
               combined_odds_summary_AFR$lower_ci[5],
               combined_odds_summary_HIS$lower_ci[5],
               combined_odds_summary_EUR$lower_ci[5],
               combined_odds_summary_OTH$lower_ci[5]),
  upper_ci = c(combined_odds_summary$upper_ci[5],
               combined_odds_summary_AFR$upper_ci[5],
               combined_odds_summary_HIS$upper_ci[5],
               combined_odds_summary_EUR$upper_ci[5],
               combined_odds_summary_OTH$upper_ci[5])
)

# Log2 transformation for a symmetric odds-ratio axis
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)

# Fixed y-limits shared with the adjusted plot below
ylim_range <- range(-2, 6.5)

# Plot. Point color fixed: "#3C8474" appears to be a typo for the palette
# green "#3C8C78" already used by the error bars and group_colors.
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)", 
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups", xaxt = "n", col = "#3C8C78")
axis(1, at = 1:5, labels = quintile_5_data$group)

# Add error bars for the 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci, 
       code = 3, angle = 90, length = 0.05, col = "#3C8C78")

Now let's compute the odds ratios for the 5th quintile by ancestry, corrected post-integration of the variables

# Collect the post-integration adjusted summaries, then pull the
# quintile-5 (top 20%) odds ratio and 95% CI from each
adjusted_summaries <- list(
  Total = combined_odds_summary_adjust,
  AFR   = combined_odds_summary_AFR_adjust,
  HIS   = combined_odds_summary_HIS_adjust,
  EUR   = combined_odds_summary_EUR_adjust,
  OTH   = combined_odds_summary_OTH_adjust
)

# Helper: quintile-5 value of a given column across all summaries
pull_q5 <- function(col) {
  unname(vapply(adjusted_summaries, function(s) s[[col]][5], numeric(1)))
}

quintile_5_data <- data.frame(
  group      = names(adjusted_summaries),
  odds_ratio = pull_q5("odds_ratio"),
  lower_ci   = pull_q5("lower_ci"),
  upper_ci   = pull_q5("upper_ci")
)

# Log2 transformation for a symmetric odds-ratio axis
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci   <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci   <- log2(quintile_5_data$upper_ci)

# Same y-limits as the unadjusted plot for direct comparison
ylim_range <- range(-2, 6.5)

# Plot the adjusted quintile-5 log2 odds ratios
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)", 
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)

# Add error bars for the 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci, 
       code = 3, angle = 90, length = 0.05, col = "black")

Now let's compute the odds ratios for the 5th quintile by ancestry, corrected pre-integration of the variables

categorized_combined_data <- categorized_combined_data %>%
  mutate(
    rank_adjusted = rank(Scores_ANCESTRY_adjusted, ties.method = "first"),
    score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
  )

# Subset the combined data into one data frame per GMM ancestry cluster.
# Bracket indexing (rather than subset()/filter()) is kept so that any rows
# with NA ancestry labels behave exactly as before.
take_ancestry <- function(df, anc) df[df$cluster_gmm_Ancestry == anc, ]

AFR <- take_ancestry(categorized_combined_data, "AFR")
HIS <- take_ancestry(categorized_combined_data, "HIS")
EUR <- take_ancestry(categorized_combined_data, "EUR")
OTH <- take_ancestry(categorized_combined_data, "OTH")

# Apply the odds-ratio function defined earlier, stratified by
# adjusted-score quintile, to the full cohort and to each ancestry subset
combined_odds_summary     <- calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR <- calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS <- calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR <- calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH <- calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)

combined_odds_summary
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
##                      <dbl>            <int>              <int> <dbl>      <dbl>
## 1                        1              156                 42 0.269       1   
## 2                        2              141                 58 0.411       1.53
## 3                        3              123                 75 0.610       2.26
## 4                        4               88                111 1.26        4.69
## 5                        5               29                170 5.86       21.8 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
##                      <dbl>            <int>              <int> <dbl>      <dbl>
## 1                        1               56                 11 0.196       1   
## 2                        2               48                 11 0.229       1.17
## 3                        3               42                 19 0.452       2.30
## 4                        4               36                 23 0.639       3.25
## 5                        5               11                 26 2.36       12.0 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6   odds odds_ratio
##                      <dbl>            <int>              <int>  <dbl>      <dbl>
## 1                        1               64                  6 0.0938       1   
## 2                        2               66                 11 0.167        1.78
## 3                        3               54                 13 0.241        2.57
## 4                        4               38                 21 0.553        5.89
## 5                        5                8                  7 0.875        9.33
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6   odds odds_ratio
##                      <dbl>            <int>              <int>  <dbl>      <dbl>
## 1                        1               29                 15  0.517       1   
## 2                        2               22                 33  1.5         2.9 
## 3                        3               19                 39  2.05        3.97
## 4                        4               13                 59  4.54        8.77
## 5                        5                7                100 14.3        27.6 
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH 
## # A tibble: 5 × 8
##   score_adjusted_quintiles count_category_1 count_category_2_6  odds odds_ratio
##                      <dbl>            <int>              <int> <dbl>      <dbl>
## 1                        1                7                 10  1.43       1   
## 2                        2                5                  3  0.6        0.42
## 3                        3                8                  4  0.5        0.35
## 4                        4                1                  8  8          5.6 
## 5                        5                3                 37 12.3        8.63
## # ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Extract quintile 5 (top 20% of the adjusted score) from each summary.
# Fixed: this previously read from the *_adjust objects built in the earlier
# section, silently re-plotting the old results instead of the summaries
# computed just above (combined_odds_summary and its ancestry variants).
quintile_5_data <- data.frame(
  group = c("Total","AFR", "HIS", "EUR", "OTH"),
  odds_ratio = c(combined_odds_summary$odds_ratio[5],
                 combined_odds_summary_AFR$odds_ratio[5],
                 combined_odds_summary_HIS$odds_ratio[5],
                 combined_odds_summary_EUR$odds_ratio[5],
                 combined_odds_summary_OTH$odds_ratio[5]),
  lower_ci = c(combined_odds_summary$lower_ci[5],
               combined_odds_summary_AFR$lower_ci[5],
               combined_odds_summary_HIS$lower_ci[5],
               combined_odds_summary_EUR$lower_ci[5],
               combined_odds_summary_OTH$lower_ci[5]),
  upper_ci = c(combined_odds_summary$upper_ci[5],
               combined_odds_summary_AFR$upper_ci[5],
               combined_odds_summary_HIS$upper_ci[5],
               combined_odds_summary_EUR$upper_ci[5],
               combined_odds_summary_OTH$upper_ci[5])
)

# Log2-transform the odds ratio and its 95% CI bounds for plotting
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)

# Fixed y-limits (hard-coded, not derived from the data) so this panel is
# directly comparable to the earlier quintile-5 plot
ylim_range <- range(-2, 6.5)

# One point per group (Total + four ancestries); x-axis drawn manually below
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)", 
     main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)

# Add error bars for the log2-transformed 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci, 
       code = 3, angle = 90, length = 0.05, col = "black")

Plot the ROC for GAPS on the data adjusted for ancestry post-integration

# Convert 'Category' into a binary outcome: 0 when the label is "Control",
# 1 for everything else (cases)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
# ROC of the ancestry-adjusted GAPS probability against case/control status
# (plot = TRUE draws the curve as a side effect)
roc_result_full_adjusted <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$adjusted_probability, plot = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

auc(roc_result_full_adjusted)
## Area under the curve: 0.7589

Plot the ROC for GAPS on the data adjusted for ancestry post-integration on the replication cohort

# Regress the discovery-cohort score on ancestry PCs 1-10; the fitted values
# capture the ancestry-explained component of the score
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, 
                                data = filtered_categorized_combined_data)

# Kept for any downstream use of the raw coefficients
ancestry_coefficients <- coef(ancestry_regression_model)

# Remove the ancestry component from the replication scores.
# predict() evaluates intercept + sum(PC_i * coef_i) on the replication PCs,
# replacing the previous error-prone manual expansion of the ten PC terms
# (identical result; NAs in any PC still propagate to NA as before).
categorized_replication_data$adjusted_scores_replication <- 
  categorized_replication_data$scores_replication - 
  predict(ancestry_regression_model, newdata = categorized_replication_data)

# Map the ancestry-adjusted scores onto the probability scale via the
# logistic function
categorized_replication_data <- categorized_replication_data %>%
  mutate(adjusted_probability_replication = plogis(adjusted_scores_replication))


# Compute the ROC curve for the ancestry-adjusted probabilities in the
# replication cohort.
# NOTE(review): assumes binary_outcome was added to
# categorized_replication_data earlier in the file — confirm.
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$adjusted_probability_replication)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_result_full_adjusted)
## Area under the curve: 0.7918
# Plot the ROC curve with the x-axis reversed (specificity from 1 to 0)
plot(roc_result_full_adjusted,xlim = c(1, 0))

Perform 50 iterations of 5-fold cross-validation on the discovery cohort and plot the distribution of outcomes

# Repeated 5-fold cross-validation: for each of n_repeats iterations, fit
# nine nested logistic-regression models (rare-coding, common-variant,
# noncoding, and combined feature sets) with caret and record each model's
# cross-validated AUC (ROC metric).
# number of iterations
n_repeats <- 50

# Store results (one row per model per iteration, appended below)
auc_results <- data.frame()

for (i in 1:n_repeats) {
  set.seed(i)  # per-iteration seed so each repeat's fold assignment is reproducible

  # cross-validation method: 5-fold CV with class probabilities, scored by ROC AUC
  cv_folds <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary)

  # Train models again
  # (warnings from glm, e.g. fitted probabilities of 0/1, are suppressed)
  cv_model_cardiomyopathy_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, 
                                        data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_epilepsy_rare <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, 
                                  data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_noncoding_rare <- suppressWarnings(train(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, 
                                   data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_common <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, 
                           data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_coding_rare <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, 
                                data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_epilepsy_and_common <- suppressWarnings(train(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, 
                                        data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_cardiac_and_common <- suppressWarnings(train(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV, 
                                       data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  cv_model_common_and_coding <- suppressWarnings(train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null, 
                                      data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

# The full model: common + coding-rare + noncoding-rare predictors; reported
# below under the label "Common, Coding & Noncoding"
cv_model_full <- suppressWarnings(
  train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
          Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + 
          Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + 
          PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
          Cardiomyopathy_null + Epilepsy_null, 
                                    data = categorized_combined_data, method = "glm", family = binomial, trControl = cv_folds, metric = "ROC"))

  # Store results for this iteration.
  # NOTE: method = "glm" has no tuning grid, so $results has exactly one row
  # and max() simply extracts that single cross-validated ROC (AUC) value.
  temp_results <- data.frame(
    Model = c("Cardiomyopathy Rare", "Epilepsy Rare", "Noncoding Rare", "Common Traits", "Coding Rare", 
              "Epilepsy & Common", "Cardiac & Common", "Common & Coding", "Common, Coding & Noncoding"),
    AUC = c(
      max(cv_model_cardiomyopathy_rare$results$ROC), 
      max(cv_model_epilepsy_rare$results$ROC), 
      max(cv_model_noncoding_rare$results$ROC), 
      max(cv_model_common$results$ROC), 
      max(cv_model_coding_rare$results$ROC), 
      max(cv_model_epilepsy_and_common$results$ROC), 
      max(cv_model_cardiac_and_common$results$ROC), 
      max(cv_model_common_and_coding$results$ROC),
      max(cv_model_full$results$ROC)
    ),
    Iteration = i
  )

  # Append this iteration's 9 rows (growing with rbind is acceptable for
  # 50 x 9 rows; preallocate if n_repeats grows substantially)
  auc_results <- rbind(auc_results, temp_results)
}


# Wes Anderson palette (Royal1 + Royal2) supplies enough colors for 9 models
AUC_palette <- c(wes_palette("Royal1"), wes_palette("Royal2"))

# Boxplot of the AUC distribution per model across the CV repeats.
# Fixed: complete themes (theme_minimal, theme_cowplot) reset ALL theme
# elements, so theme_cowplot() applied last was discarding the 45-degree
# axis-label rotation (and theme_minimal was itself overridden). The complete
# theme now comes first and the theme() tweak last.
ggplot(auc_results, aes(x = Model, y = AUC, fill = Model)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = AUC_palette) +  # Apply custom palette
  labs(title = "AUC Distributions Across Models", x = "Model", y = "AUC") +
  theme_cowplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(dplyr)

# Per-model median and interquartile range of the cross-validated AUCs
summary_stats <- auc_results %>%
  group_by(Model) %>%
  summarise(
    Median_AUC = median(AUC),
    IQR_AUC = IQR(AUC)
  ) %>%
  ungroup()

print(summary_stats)
## # A tibble: 9 × 3
##   Model                      Median_AUC IQR_AUC
##   <chr>                           <dbl>   <dbl>
## 1 Cardiac & Common                0.717 0.00474
## 2 Cardiomyopathy Rare             0.683 0.00300
## 3 Coding Rare                     0.689 0.00460
## 4 Common & Coding                 0.720 0.00697
## 5 Common Traits                   0.663 0.00671
## 6 Common, Coding & Noncoding      0.738 0.00610
## 7 Epilepsy & Common               0.684 0.00727
## 8 Epilepsy Rare                   0.640 0.00332
## 9 Noncoding Rare                  0.581 0.00289

Now down-sample 20 different times and re-run the 5-fold validations so that the ancestries match.

# Nested cross-validation on ancestry-matched, down-sampled data:
# 20 outer iterations each draw a balanced case/control sample per ancestry,
# then run 5 inner folds stratified by ancestry and category.
# Initialize a dataframe to store all AUCs
auc_results <- data.frame(
  Outer_Fold = integer(),
  Inner_Fold = integer(),
  AUC = numeric(),
  stringsAsFactors = FALSE
)

# Define number of outer folds
outer_folds <- 20
inner_folds <- 5

# Outer loop: 20 iterations.
# NOTE(review): no set.seed() here, so the down-sampling and fold assignment
# differ between runs — confirm whether reproducibility is desired.
for (i in 1:outer_folds) {
  
  # Create a new randomly downsampled dataset (ensuring balanced case/control
  # per ancestry). Assumes each ancestry/category stratum has at least the
  # requested n rows — slice_sample() errors otherwise; TODO confirm counts.
  downsampled_data <- bind_rows(
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "zCase") %>% slice_sample(n = 90),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "EUR", Category == "Control") %>% slice_sample(n = 90),
    
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "zCase") %>% slice_sample(n = 58),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "HIS", Category == "Control") %>% slice_sample(n = 58),
    
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "zCase") %>% slice_sample(n = 90),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "AFR", Category == "Control") %>% slice_sample(n = 90),
    
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "zCase") %>% slice_sample(n = 24),
    categorized_combined_data %>% filter(cluster_gmm_Ancestry == "OTH", Category == "Control") %>% slice_sample(n = 24)
  )

  # Ensure binary outcome is properly coded (1 = case, 0 = control)
  downsampled_data$binary_outcome <- ifelse(downsampled_data$Category == "zCase", 1, 0)

  # Create 5 stratified folds **by both ancestry and category** so every fold
  # preserves the matched composition
  folds <- downsampled_data %>%
    group_by(cluster_gmm_Ancestry, Category) %>%
    mutate(Fold = sample(rep(1:inner_folds, length.out = n()))) %>%
    ungroup()

  # Train and test across the 5 folds within this downsampled dataset
  inner_auc_values <- numeric(inner_folds)

  for (j in 1:inner_folds) {
    
    # Split into training and testing sets (fold j held out for testing)
    train_data <- folds %>% filter(Fold != j) %>% dplyr::select(-Fold)
    test_data  <- folds %>% filter(Fold == j) %>% dplyr::select(-Fold)
    
    # Train the full model (common + coding-rare + noncoding-rare predictors);
    # no trControl here — caret's default resampling tunes nothing for glm
    cv_model <- suppressWarnings(
  train(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + 
          Normalized_LVESV + Normalized_SVI + Normalized_Afib + Normalized_LVEDVI + Normalized_SV + 
          Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + 
          PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + 
          Cardiomyopathy_null + Epilepsy_null, 
        data = train_data, method = "glm", family = binomial)
)
 
    # Get predicted probabilities for the positive class (zCase)
    test_data$predicted_prob <- predict(cv_model, test_data, type = "prob")[, "zCase"]

    # Compute ROC curve and AUC on the held-out fold
    roc_result <- roc(test_data$binary_outcome, test_data$predicted_prob)
    inner_auc_values[j] <- auc(roc_result)
    
    # Store the per-fold AUC result
    auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = j, AUC = inner_auc_values[j]))
  }
  
  # Store the mean AUC from the 5 inner folds for this outer iteration
  # (flagged with Inner_Fold = NA to distinguish it from per-fold rows)
  auc_results <- rbind(auc_results, data.frame(Outer_Fold = i, Inner_Fold = NA, AUC = mean(inner_auc_values)))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Ensure Inner_Fold is treated as character for downstream handling
auc_results <- auc_results %>%
  mutate(Inner_Fold = as.character(Inner_Fold))

# Keep only the per-fold rows. The per-iteration mean rows were stored with
# Inner_Fold = NA; the previous test `Inner_Fold != "Mean"` only excluded
# them by accident of NA propagation (filter() drops NA conditions).
# The intent is now explicit and the result is identical.
inner_auc_data <- auc_results %>%
  filter(!is.na(Inner_Fold))

# Compute mean AUC per outer fold from the per-fold rows
outer_auc_means <- inner_auc_data %>%
  group_by(Outer_Fold) %>%
  summarise(AUC = mean(AUC)) %>%
  ungroup()

# AUC is already numeric; kept as defensive no-op coercions
inner_auc_data$AUC <- as.numeric(inner_auc_data$AUC)
outer_auc_means$AUC <- as.numeric(outer_auc_means$AUC)

# Print to verify correct means
print(outer_auc_means)
## # A tibble: 20 × 2
##    Outer_Fold   AUC
##         <int> <dbl>
##  1          1 0.668
##  2          2 0.676
##  3          3 0.665
##  4          4 0.676
##  5          5 0.699
##  6          6 0.686
##  7          7 0.678
##  8          8 0.684
##  9          9 0.675
## 10         10 0.682
## 11         11 0.652
## 12         12 0.682
## 13         13 0.673
## 14         14 0.604
## 15         15 0.669
## 16         16 0.672
## 17         17 0.676
## 18         18 0.682
## 19         19 0.681
## 20         20 0.676

Now plot the distribution of all the down-sampled, ancestry-matched AUC results, layering the means of each outer fold on top

# Stacked percentage bar plot of ancestry makeup within each category.
# Fixed: theme_cowplot(12) is a complete theme and was applied AFTER the
# theme() tweak, silently overriding legend.position = "bottom"; the complete
# theme now comes first and the modification last.
ancestry_makeup <- ggplot(downsampled_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
  xlab("Category") +
  ylab("Percentage") +
  scale_fill_manual(values = palette_colors) +
  theme_cowplot(12) +
  theme(legend.position = "bottom")


ancestry_makeup

# Histogram of all inner-fold AUCs (20 outer x 5 inner), with each outer
# fold's mean AUC overlaid as a dashed vertical line and chance performance
# (AUC = 0.5) marked in solid red.
# NOTE(review): `size` for line geoms is deprecated since ggplot2 3.4.0 in
# favour of `linewidth`; left unchanged to avoid assuming the installed
# version — update if ggplot2 >= 3.4 is confirmed.
auc_histogram <- ggplot(inner_auc_data, aes(x = AUC)) +
  geom_histogram(binwidth = 0.01, fill = "lightblue", color = "black") +  
    geom_vline(data = outer_auc_means, aes(xintercept = AUC), color = "black", linetype = "dashed", size = 0.5) + 
  geom_vline(xintercept = 0.5, color = "red", linetype = "solid", size = 1) +
  xlab("AUC") +
  ylab("Frequency") +
  ggtitle("Histogram of Inner Fold AUCs with Outer Fold Means") +
  theme_cowplot(12)

auc_histogram

# Grand mean of the per-outer-fold mean AUCs
mean_outer_auc <- mean(outer_auc_means$AUC)

# Report the overall cross-validated performance
print(paste("Mean AUC across outer folds:", mean_outer_auc))
## [1] "Mean AUC across outer folds: 0.672768927539058"
# Load the reporter assay data (tab-delimited; check.names = FALSE preserves
# the original column headers as condition labels)
reporter_data <- read.delim("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Reporter_analysis.txt", check.names = FALSE)

# Reshape wide -> long: every column becomes a Condition, every cell an
# Activity measurement. Assumes one experimental condition per column —
# TODO confirm against the input file's layout.
reporter_long <- reporter_data %>%
  pivot_longer(cols = everything(), names_to = "Condition", values_to = "Activity")

library(ggplot2)

# Boxplot per condition with individual points jittered on top; boxplot
# outliers are hidden (outlier.shape = NA) since the jittered points already
# show every observation
ggplot(reporter_long, aes(x = Condition, y = Activity, fill = Condition)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.6, color = "black") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  ylab("Reporter Activity") +
  xlab("") +
  # y-axis clipped (not filtered) to 110% of the observed maximum
  coord_cartesian(ylim = c(0, max(reporter_long$Activity, na.rm = TRUE) * 1.1)) +
  ggtitle("Reporter Assay Activity by Condition")